Added blktap support. Includes kernel driver (enabled as CONFIG_XEN_BLKDEV_TAP=y...
authorjchesterfield@dhcp92.uk.xensource.com <jchesterfield@dhcp92.uk.xensource.com>
Thu, 13 Jul 2006 09:13:26 +0000 (10:13 +0100)
committerjchesterfield@dhcp92.uk.xensource.com <jchesterfield@dhcp92.uk.xensource.com>
Thu, 13 Jul 2006 09:13:26 +0000 (10:13 +0100)
133 files changed:
buildconfigs/linux-defconfig_xen0_x86_32
buildconfigs/linux-defconfig_xen0_x86_64
buildconfigs/linux-defconfig_xen_x86_32
buildconfigs/linux-defconfig_xen_x86_64
linux-2.6-xen-sparse/drivers/xen/Kconfig
linux-2.6-xen-sparse/drivers/xen/Makefile
linux-2.6-xen-sparse/drivers/xen/blktap/Makefile [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/common.h [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/interface.c [new file with mode: 0644]
linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c [new file with mode: 0644]
patches/linux-2.6.16.13/blktap-aio-16_03_06.patch [new file with mode: 0644]
tools/Makefile
tools/blktap/Makefile [new file with mode: 0644]
tools/blktap/README [new file with mode: 0644]
tools/blktap/drivers/Makefile [new file with mode: 0644]
tools/blktap/drivers/aes.c [new file with mode: 0644]
tools/blktap/drivers/aes.h [new file with mode: 0644]
tools/blktap/drivers/blktapctrl.c [new file with mode: 0644]
tools/blktap/drivers/blktapctrl.h [new file with mode: 0644]
tools/blktap/drivers/block-aio.c [new file with mode: 0644]
tools/blktap/drivers/block-qcow.c [new file with mode: 0644]
tools/blktap/drivers/block-ram.c [new file with mode: 0644]
tools/blktap/drivers/block-sync.c [new file with mode: 0644]
tools/blktap/drivers/block-vmdk.c [new file with mode: 0644]
tools/blktap/drivers/bswap.h [new file with mode: 0644]
tools/blktap/drivers/img2qcow.c [new file with mode: 0644]
tools/blktap/drivers/qcow-create.c [new file with mode: 0644]
tools/blktap/drivers/qcow2raw.c [new file with mode: 0644]
tools/blktap/drivers/tapdisk.c [new file with mode: 0644]
tools/blktap/drivers/tapdisk.h [new file with mode: 0644]
tools/blktap/lib/Makefile [new file with mode: 0644]
tools/blktap/lib/blkif.c [new file with mode: 0644]
tools/blktap/lib/blktaplib.h [new file with mode: 0644]
tools/blktap/lib/list.h [new file with mode: 0644]
tools/blktap/lib/xenbus.c [new file with mode: 0644]
tools/blktap/lib/xs_api.c [new file with mode: 0644]
tools/blktap/lib/xs_api.h [new file with mode: 0644]
tools/examples/Makefile
tools/examples/blktap [new file with mode: 0644]
tools/examples/xen-backend.agent
tools/examples/xen-backend.rules
tools/libaio/COPYING [new file with mode: 0644]
tools/libaio/ChangeLog [new file with mode: 0644]
tools/libaio/INSTALL [new file with mode: 0644]
tools/libaio/Makefile [new file with mode: 0644]
tools/libaio/TODO [new file with mode: 0644]
tools/libaio/harness/Makefile [new file with mode: 0644]
tools/libaio/harness/README [new file with mode: 0644]
tools/libaio/harness/attic/0.t [new file with mode: 0644]
tools/libaio/harness/attic/1.t [new file with mode: 0644]
tools/libaio/harness/cases/10.t [new file with mode: 0644]
tools/libaio/harness/cases/11.t [new file with mode: 0644]
tools/libaio/harness/cases/12.t [new file with mode: 0644]
tools/libaio/harness/cases/13.t [new file with mode: 0644]
tools/libaio/harness/cases/14.t [new file with mode: 0644]
tools/libaio/harness/cases/2.t [new file with mode: 0644]
tools/libaio/harness/cases/3.t [new file with mode: 0644]
tools/libaio/harness/cases/4.t [new file with mode: 0644]
tools/libaio/harness/cases/5.t [new file with mode: 0644]
tools/libaio/harness/cases/6.t [new file with mode: 0644]
tools/libaio/harness/cases/7.t [new file with mode: 0644]
tools/libaio/harness/cases/8.t [new file with mode: 0644]
tools/libaio/harness/cases/aio_setup.h [new file with mode: 0644]
tools/libaio/harness/cases/common-7-8.h [new file with mode: 0644]
tools/libaio/harness/main.c [new file with mode: 0644]
tools/libaio/harness/runtests.sh [new file with mode: 0644]
tools/libaio/libaio.spec [new file with mode: 0644]
tools/libaio/man/aio.3 [new file with mode: 0644]
tools/libaio/man/aio_cancel.3 [new file with mode: 0644]
tools/libaio/man/aio_cancel64.3 [new file with mode: 0644]
tools/libaio/man/aio_error.3 [new file with mode: 0644]
tools/libaio/man/aio_error64.3 [new file with mode: 0644]
tools/libaio/man/aio_fsync.3 [new file with mode: 0644]
tools/libaio/man/aio_fsync64.3 [new file with mode: 0644]
tools/libaio/man/aio_init.3 [new file with mode: 0644]
tools/libaio/man/aio_read.3 [new file with mode: 0644]
tools/libaio/man/aio_read64.3 [new file with mode: 0644]
tools/libaio/man/aio_return.3 [new file with mode: 0644]
tools/libaio/man/aio_return64.3 [new file with mode: 0644]
tools/libaio/man/aio_suspend.3 [new file with mode: 0644]
tools/libaio/man/aio_suspend64.3 [new file with mode: 0644]
tools/libaio/man/aio_write.3 [new file with mode: 0644]
tools/libaio/man/aio_write64.3 [new file with mode: 0644]
tools/libaio/man/io.3 [new file with mode: 0644]
tools/libaio/man/io_cancel.1 [new file with mode: 0644]
tools/libaio/man/io_cancel.3 [new file with mode: 0644]
tools/libaio/man/io_destroy.1 [new file with mode: 0644]
tools/libaio/man/io_fsync.3 [new file with mode: 0644]
tools/libaio/man/io_getevents.1 [new file with mode: 0644]
tools/libaio/man/io_getevents.3 [new file with mode: 0644]
tools/libaio/man/io_prep_fsync.3 [new file with mode: 0644]
tools/libaio/man/io_prep_pread.3 [new file with mode: 0644]
tools/libaio/man/io_prep_pwrite.3 [new file with mode: 0644]
tools/libaio/man/io_queue_init.3 [new file with mode: 0644]
tools/libaio/man/io_queue_release.3 [new file with mode: 0644]
tools/libaio/man/io_queue_run.3 [new file with mode: 0644]
tools/libaio/man/io_queue_wait.3 [new file with mode: 0644]
tools/libaio/man/io_set_callback.3 [new file with mode: 0644]
tools/libaio/man/io_setup.1 [new file with mode: 0644]
tools/libaio/man/io_submit.1 [new file with mode: 0644]
tools/libaio/man/io_submit.3 [new file with mode: 0644]
tools/libaio/man/lio_listio.3 [new file with mode: 0644]
tools/libaio/man/lio_listio64.3 [new file with mode: 0644]
tools/libaio/src/Makefile [new file with mode: 0644]
tools/libaio/src/compat-0_1.c [new file with mode: 0644]
tools/libaio/src/io_cancel.c [new file with mode: 0644]
tools/libaio/src/io_destroy.c [new file with mode: 0644]
tools/libaio/src/io_getevents.c [new file with mode: 0644]
tools/libaio/src/io_queue_init.c [new file with mode: 0644]
tools/libaio/src/io_queue_release.c [new file with mode: 0644]
tools/libaio/src/io_queue_run.c [new file with mode: 0644]
tools/libaio/src/io_queue_wait.c [new file with mode: 0644]
tools/libaio/src/io_setup.c [new file with mode: 0644]
tools/libaio/src/io_submit.c [new file with mode: 0644]
tools/libaio/src/libaio.h [new file with mode: 0644]
tools/libaio/src/libaio.map [new file with mode: 0644]
tools/libaio/src/raw_syscall.c [new file with mode: 0644]
tools/libaio/src/syscall-alpha.h [new file with mode: 0644]
tools/libaio/src/syscall-i386.h [new file with mode: 0644]
tools/libaio/src/syscall-ia64.h [new file with mode: 0644]
tools/libaio/src/syscall-ppc.h [new file with mode: 0644]
tools/libaio/src/syscall-s390.h [new file with mode: 0644]
tools/libaio/src/syscall-x86_64.h [new file with mode: 0644]
tools/libaio/src/syscall.h [new file with mode: 0644]
tools/libaio/src/vsys_def.h [new file with mode: 0644]
tools/misc/xend
tools/python/xen/xend/XendDomainInfo.py
tools/python/xen/xend/server/BlktapController.py [new file with mode: 0644]
tools/python/xen/xm/create.py
tools/python/xen/xm/main.py
tools/xenstore/Makefile
xen/common/grant_table.c

index 8d34d2c75b64f41d13aa7a279764ef0a6ac33a4a..3280f54f2cdca4e135bbaa3e5a0191466121e8e7 100644 (file)
@@ -1322,6 +1322,7 @@ CONFIG_XEN_PCIDEV_BACKEND=y
 CONFIG_XEN_PCIDEV_BACKEND_PASS=y
 # CONFIG_XEN_PCIDEV_BE_DEBUG is not set
 CONFIG_XEN_BLKDEV_BACKEND=y
+CONFIG_XEN_BLKDEV_TAP=y
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_NETDEV_LOOPBACK=y
index 03727525a4df553fd0dfe1541897e31b59d9c956..c219cd7f3c9c6d6b5b2fbd0c3369f4c6df57ac2c 100644 (file)
@@ -1263,6 +1263,7 @@ CONFIG_XEN_PCIDEV_BACKEND=y
 CONFIG_XEN_PCIDEV_BACKEND_PASS=y
 # CONFIG_XEN_PCIDEV_BE_DEBUG is not set
 CONFIG_XEN_BLKDEV_BACKEND=y
+CONFIG_XEN_BLKDEV_TAP=y
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_NETDEV_LOOPBACK=y
index db25f1717cd82f6046998d4aedf03f5fbd6b6c0b..286e47d71e27dace9f793abff77f8dea2cb400a9 100644 (file)
@@ -3023,6 +3023,7 @@ CONFIG_XEN_PCIDEV_BACKEND_VPCI=y
 # CONFIG_XEN_PCIDEV_BACKEND_PASS is not set
 # CONFIG_XEN_PCIDEV_BE_DEBUG is not set
 CONFIG_XEN_BLKDEV_BACKEND=y
+CONFIG_XEN_BLKDEV_TAP=y
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_NETDEV_LOOPBACK=y
index b891318b2b1a7db2ecfd4dc5d575e3d3a0055dcb..6299af3b5ee68113eade66fb60d80cb930e3d41b 100644 (file)
@@ -2855,6 +2855,7 @@ CONFIG_XEN_PCIDEV_BACKEND=m
 CONFIG_XEN_PCIDEV_BACKEND_PASS=y
 # CONFIG_XEN_PCIDEV_BE_DEBUG is not set
 CONFIG_XEN_BLKDEV_BACKEND=y
+CONFIG_XEN_BLKDEV_TAP=y
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_NETDEV_LOOPBACK=y
index 6124dda0f090ab5a72fe07e97d08343394e33d40..ddc59f91c39f65bb9dc32e23e59af87406ee30f0 100644 (file)
@@ -94,6 +94,18 @@ config XEN_XENBUS_DEV
        depends on PROC_FS
        default y
 
+config XEN_BLKDEV_TAP
+       tristate "Block device tap backend"
+       depends on XEN_BACKEND
+       default XEN_PRIVILEGED_GUEST
+       help
+         The block tap driver is an alternative to the block back driver 
+          and allows VM block requests to be redirected to userspace through
+          a device interface.  The tap allows user-space development of 
+          high-performance block backends, where disk images may be implemented
+          as files, in memory, or on other hosts across the network.  This 
+         driver can safely coexist with the existing blockback driver.
+
 config XEN_NETDEV_BACKEND
        tristate "Network-device backend driver"
         depends on XEN_BACKEND && NET
index 6771afeb1c80b5948dcba85392de67fd82065723..79eddeec16a1be0d54d61f1e106a0ae04ed8abbe 100644 (file)
@@ -8,6 +8,7 @@ obj-$(CONFIG_XEN_UTIL)                  += util.o
 obj-$(CONFIG_XEN_BALLOON)              += balloon/
 obj-$(CONFIG_XEN_DEVMEM)               += char/
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
+obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
 obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
 obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/Makefile b/linux-2.6-xen-sparse/drivers/xen/blktap/Makefile
new file mode 100644 (file)
index 0000000..409b078
--- /dev/null
@@ -0,0 +1,3 @@
+LINUXINCLUDE += -I../xen/include/public/io
+obj-y  := xenbus.o interface.o blktap.o 
+
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
new file mode 100644 (file)
index 0000000..deb9e61
--- /dev/null
@@ -0,0 +1,1439 @@
+/******************************************************************************
+ * drivers/xen/blktap/blktap.c
+ * 
+ * Back-end driver for user level virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. Requests
+ * are remapped to a user-space memory region.
+ *
+ * Based on the blkback driver code.
+ * 
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+#include <xen/balloon.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/tlbflush.h>
+#include <linux/devfs_fs_kernel.h>
+
+#define MAX_TAP_DEV 100     /*the maximum number of tapdisk ring devices    */
+#define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by 
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define MAX_DYNAMIC_MEM 64
+#define MAX_PENDING_REQS 64   
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg)                                   \
+        (_start +                                                       \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+static int blkif_reqs = MAX_PENDING_REQS;
+static int mmap_pages = MMAP_PAGES;
+
+#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
+                     * have a bunch of pages reserved for shared
+                     * memory rings.
+                     */
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+       struct vm_area_struct *vma;   /*Shared memory area                   */
+       unsigned long rings_vstart;   /*Kernel memory mapping                */
+       unsigned long user_vstart;    /*User memory mapping                  */
+       unsigned long dev_inuse;      /*One process opens device at a time.  */
+       unsigned long dev_pending;    /*In process of being opened           */
+       unsigned long ring_ok;        /*make this ring->state                */
+       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
+       wait_queue_head_t wait;       /*for poll                             */
+       unsigned long mode;           /*current switching mode               */
+       int minor;                    /*Minor number for tapdisk device      */
+       pid_t pid;                    /*tapdisk process id                   */
+       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
+                                                 shutdown                   */
+       unsigned long *idx_map;       /*Record the user ring id to kern 
+                                       [req id, idx] tuple                  */
+       blkif_t *blkif;               /*Associate blkif with tapdev          */
+} tap_blkif_t;
+
+/*Private data struct associated with the inode*/
+typedef struct private_info {
+       int idx;
+} private_info_t;
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
+
+domid_translate_t  translate_domid[MAX_TAP_DEV];
+tap_blkif_t *tapfds[MAX_TAP_DEV];
+
+/*
+ * Handler for the "blkif_reqs=" option registered through __setup() below.
+ * Parses an integer out of the option string into blkif_reqs and always
+ * returns 1.
+ * NOTE(review): the parsed value is not range-checked here (e.g. against
+ * MAX_PENDING_REQS) -- confirm that users of blkif_reqs cope with that.
+ */
+static int __init set_blkif_reqs(char *str)
+{
+       get_option(&str, &blkif_reqs);
+       return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* Run-time switchable: /sys/module/blktap/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+       blkif_t       *blkif;
+       unsigned long  id;
+       unsigned short mem_idx;
+       int            nr_pages;
+       atomic_t       pendcnt;
+       unsigned short operation;
+       int            status;
+       struct list_head free_list;
+       int            inuse;
+} pending_req_t;
+
+static pending_req_t *pending_reqs[MAX_PENDING_REQS];
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
+static int alloc_pending_reqs;
+
+typedef unsigned int PEND_RING_IDX;
+
+static inline int MASK_PEND_IDX(int i) { 
+       return (i & (MAX_PENDING_REQS-1)); 
+}
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
+       return (req - pending_reqs[idx]);
+}
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+typedef struct mmap_page {
+       unsigned long start;
+       struct page *mpage;
+} mmap_page_t;
+
+static mmap_page_t mmap_start[MAX_DYNAMIC_MEM];
+static unsigned short mmap_alloc = 0;
+static unsigned short mmap_lock = 0;
+static unsigned short mmap_inuse = 0;
+static unsigned long *pending_addrs[MAX_DYNAMIC_MEM];
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+        grant_handle_t kernel;
+        grant_handle_t user;
+};
+
+static struct grant_handle_pair 
+    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
+#define pending_handle(_id, _idx, _i) \
+    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
+    + (_i)])
+
+
+static int blktap_read_ufe_ring(int idx); /*local prototypes*/
+
+#define BLKTAP_MINOR 0  /*/dev/xen/blktap resides at device number
+                         major=254, minor numbers begin at 0            */ 
+#define BLKTAP_DEV_MAJOR 254         /* TODO: Make major number dynamic  *
+                                      * and create devices in the kernel *
+                                     */
+#define BLKTAP_DEV_DIR  "/dev/xen"
+
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_SENDPID        4
+#define BLKTAP_IOCTL_NEWINTF        5
+#define BLKTAP_IOCTL_MINOR          6
+#define BLKTAP_IOCTL_MAJOR          7
+#define BLKTAP_QUERY_ALLOC_REQS      8
+#define BLKTAP_IOCTL_FREEINTF        9
+#define BLKTAP_IOCTL_PRINT_IDXS      100  
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+
+/*
+ * Return 1 if arg is one of the switching modes accepted by
+ * BLKTAP_IOCTL_SETMODE (passthrough, intercept-FE, or interpose),
+ * 0 for any other value.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+       switch (arg) {
+       case BLKTAP_MODE_PASSTHROUGH:
+       case BLKTAP_MODE_INTERCEPT_FE:
+       case BLKTAP_MODE_INTERPOSE:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID. 
+ */
+
+/*
+ * Pack a tag and a pending-ring index into a single request ID: the tag
+ * occupies bits 16 and up, the masked ring index the low bits.
+ *
+ * fe_dom is widened to unsigned long before the shift.  If domid_t is
+ * narrower than int, shifting the promoted (signed) value left by 16 can
+ * reach the sign bit and then sign-extend when converted to unsigned long.
+ */
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+        return (((unsigned long)fe_dom << 16) |
+                (unsigned long)MASK_PEND_IDX(idx));
+}
+
+/*
+ * Return the low 16 bits of a request ID as a ring index.
+ *
+ * Declared static inline (not extern inline): with GNU89 inline semantics
+ * an extern inline definition emits no out-of-line copy, so any call the
+ * compiler chooses not to inline would fail to link.
+ */
+static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+        return (PEND_RING_IDX)(id & 0x0000ffff);
+}
+
+/*
+ * Return the part of a request ID above bit 15 (id >> 16) as an int.
+ *
+ * Declared static inline (not extern inline): with GNU89 inline semantics
+ * an extern inline definition emits no out-of-line copy, so any call the
+ * compiler chooses not to inline would fail to link.
+ */
+static inline int ID_TO_MIDX(unsigned long id)
+{
+        return (int)(id >> 16);
+}
+
+#define INVALID_REQ 0xdead0000
+
+/*TODO: Convert to a free list*/
+/*
+ * Find the first slot of idx_map that still holds INVALID_REQ.
+ * Returns that slot number, or INVALID_REQ when all MAX_PENDING_REQS
+ * slots are occupied.
+ */
+static inline int GET_NEXT_REQ(unsigned long *idx_map)
+{
+       int slot = 0;
+
+       /* Linear scan from the start of the table. */
+       while (slot < MAX_PENDING_REQS) {
+               if (idx_map[slot] == INVALID_REQ)
+                       return slot;
+               slot++;
+       }
+
+       /* No free slot was found. */
+       return INVALID_REQ;
+}
+
+
+/*
+ * True when both the kernel and the user handle of the pair pointed to by
+ * _g hold 0xFFFF.  The argument is parenthesised before '->' so that
+ * non-trivial pointer expressions expand correctly.
+ */
+#define BLKTAP_INVALID_HANDLE(_g) \
+    ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
+
+#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
+    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+    } while(0)
+
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+                                 unsigned long address,
+                                 int *type)
+{
+       /*
+        * if the page has not been mapped in by the driver then return
+        * NOPAGE_SIGBUS to the domain.
+        */
+
+       return NOPAGE_SIGBUS;
+}
+
+struct vm_operations_struct blktap_vm_ops = {
+       nopage:   blktap_nopage,
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+/*Function Declarations*/
+static int get_next_free_dev(void);
+static int blktap_open(struct inode *inode, struct file *filp);
+static int blktap_release(struct inode *inode, struct file *filp);
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg);
+static unsigned int blktap_poll(struct file *file, poll_table *wait);
+
+struct miscdevice *set_misc(int minor, char *name, int dev);
+
+static struct file_operations blktap_fops = {
+       .owner   = THIS_MODULE,
+       .poll    = blktap_poll,
+       .ioctl   = blktap_ioctl,
+       .open    = blktap_open,
+       .release = blktap_release,
+       .mmap    = blktap_mmap,
+};
+
+
+/*
+ * Reserve the first tap device that exists in tapfds[] and is neither in
+ * use nor already pending.  The chosen device is marked dev_pending.
+ * The scan runs with pending_free_lock held and interrupts saved.
+ *
+ * Returns the device index, or -1 if no device is available.
+ */
+static int get_next_free_dev(void)
+{
+       unsigned long flags;
+       int slot;
+       int found = -1;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+
+       for (slot = 0; slot < MAX_TAP_DEV; slot++) {
+               tap_blkif_t *dev = tapfds[slot];
+
+               /* Skip empty slots and devices that are busy or reserved. */
+               if (dev == NULL)
+                       continue;
+               if (dev->dev_inuse != 0 || dev->dev_pending != 0)
+                       continue;
+
+               dev->dev_pending = 1;
+               found = slot;
+               break;
+       }
+
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       return found;
+}
+
+/*
+ * Look up the tap device registered for (domid, xenbus_id) in
+ * translate_domid[].  On a match the device is bound to blkif and its
+ * status is set to RUNNING.
+ *
+ * Slots with no tapfds[] entry are skipped: a translation entry can match
+ * (the table is a zero-initialised global) while the device pointer is
+ * still NULL, and dereferencing it would oops.
+ *
+ * Returns the device index, or -1 if no usable match exists.
+ */
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
+{
+       int i;
+
+       for (i = 0; i < MAX_TAP_DEV; i++) {
+               if (tapfds[i] == NULL)
+                       continue;
+
+               if ( (translate_domid[i].domid == domid)
+                   && (translate_domid[i].busid == xenbus_id) ) {
+                       tapfds[i]->blkif = blkif;
+                       tapfds[i]->status = RUNNING;
+                       return i;
+               }
+       }
+       return -1;
+}
+
+/*
+ * Detach the blkif from tap device idx.  For idx > 0 with a recorded pid
+ * whose task can still be found, the device status is set to
+ * CLEANSHUTDOWN first.
+ *
+ * idx is validated before tapfds[] is indexed, and an empty slot is
+ * ignored, so an invalid index (for example a -1 lookup failure) no
+ * longer reads outside the array or writes through a NULL pointer.
+ */
+void signal_tapdisk(int idx)
+{
+       tap_blkif_t *info;
+       struct task_struct *ptask;
+
+       if ( (idx < 0) || (idx >= MAX_TAP_DEV) )
+               return;
+
+       info = tapfds[idx];
+       if (info == NULL)
+               return;
+
+       if ( (idx > 0) && (info->pid > 0) ) {
+               ptask = find_task_by_pid(info->pid);
+               if (ptask)
+                       info->status = CLEANSHUTDOWN;
+       }
+       info->blkif = NULL;
+}
+
+/*
+ * open() handler for a tap device node.
+ *
+ * Claims the device for a single opener (bit 0 of dev_inuse), allocates a
+ * zeroed page for the shared ring and initialises info->ufe_ring on it,
+ * allocates the per-file private_info_t and the idx_map table, and stores
+ * the device index in filp->private_data.  For idx > 0 the wait queue is
+ * initialised and every idx_map entry is set to INVALID_REQ.
+ *
+ * Returns 0 on success, -EBUSY if the device is already open, and -ENOMEM
+ * if the minor is outside tapfds[], has no device, or an allocation fails.
+ * On allocation failure everything allocated so far is released and
+ * dev_inuse is cleared, so the device can be opened again later.
+ */
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+       blkif_sring_t *sring;
+       int idx = iminor(inode) - BLKTAP_MINOR;
+       tap_blkif_t *info;
+       private_info_t *prv;
+       int i;
+
+       /* Validate the minor before it is used to index tapfds[]. */
+       if ( (idx < 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL) ) {
+               WPRINTK("Unable to open device /dev/xen/blktap%d\n",
+                      idx);
+               return -ENOMEM;
+       }
+       DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
+
+       info = tapfds[idx];
+
+       /*Only one process can access device at a time*/
+       if (test_and_set_bit(0, &info->dev_inuse))
+               return -EBUSY;
+
+       info->dev_pending = 0;
+
+       /* Do all allocations first so failure can unwind cleanly. */
+       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+       if (sring == NULL)
+               goto fail_nomem;
+
+       prv = kzalloc(sizeof(private_info_t),GFP_KERNEL);
+       if (prv == NULL)
+               goto fail_free_ring;
+
+       info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
+                               GFP_KERNEL);
+       if (info->idx_map == NULL)
+               goto fail_free_prv;
+
+       /* Set up the fe ring on the freshly allocated page. */
+       SetPageReserved(virt_to_page(sring));
+
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
+
+       prv->idx = idx;
+       filp->private_data = prv;
+       info->vma = NULL;
+
+       if (idx > 0) {
+               init_waitqueue_head(&info->wait);
+               for (i = 0; i < MAX_PENDING_REQS; i++)
+                       info->idx_map[i] = INVALID_REQ;
+       }
+
+       DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
+       return 0;
+
+ fail_free_prv:
+       kfree(prv);
+ fail_free_ring:
+       free_page((unsigned long)sring);
+ fail_nomem:
+       /* Release the single-opener claim taken above. */
+       info->dev_inuse = 0;
+       return -ENOMEM;
+}
+
+/*
+ * release() handler for a tap device node.
+ *
+ * Marks the device as no longer in use, frees the shared ring page, zaps
+ * any user mapping recorded in info->vma, frees the per-file private data,
+ * stops the associated xenblkd thread if userspace did not shut down
+ * cleanly, and frees the idx_map table allocated by blktap_open().
+ *
+ * Returns 0 on success, -1 if the minor does not name an existing device.
+ *
+ * NOTE(review): the page-pointer table installed in vma->vm_private_data
+ * by blktap_mmap() is not freed here -- confirm where it is released.
+ * NOTE(review): dev_inuse is cleared before the teardown below, so a
+ * concurrent open could overlap with it -- confirm whether that can occur.
+ */
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+       int idx = iminor(inode) - BLKTAP_MINOR;
+       tap_blkif_t *info;
+
+       /* Validate the minor before it is used to index tapfds[]. */
+       if ( (idx < 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL) ) {
+               WPRINTK("Trying to free device that doesn't exist "
+                      "[/dev/xen/blktap%d]\n",idx);
+               return -1;
+       }
+       info = tapfds[idx];
+       info->dev_inuse = 0;
+       DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx);
+
+       /* Free the ring page. */
+       ClearPageReserved(virt_to_page(info->ufe_ring.sring));
+       free_page((unsigned long) info->ufe_ring.sring);
+
+       /* Clear any active mappings */
+       if (info->vma) {
+               zap_page_range(
+                       info->vma, info->vma->vm_start,
+                       info->vma->vm_end - info->vma->vm_start, NULL);
+               info->vma = NULL;
+       }
+
+       /* kfree() accepts NULL, so no separate check is needed. */
+       kfree(filp->private_data);
+
+       if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
+               /* Only stop a thread that actually exists. */
+               if (info->blkif->xenblkd != NULL) {
+                       kthread_stop(info->blkif->xenblkd);
+                       info->blkif->xenblkd = NULL;
+               }
+               info->status = CLEANSHUTDOWN;
+       }
+
+       /*
+        * Free the table allocated in blktap_open(); without this each
+        * open/close cycle leaks it.  Done after the thread has been
+        * stopped so nothing is still reading it.
+        */
+       kfree(info->idx_map);
+       info->idx_map = NULL;
+
+       return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them.  This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a 
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space.  This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
+ * from pages to actual page structs.  There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+/*
+ * mmap() handler for a tap device node.
+ *
+ * The mapping must cover exactly (mmap_pages + RING_PAGES) pages.  The
+ * first page is mapped non-cached onto the shared ring page; the rest of
+ * the range is recorded as the user data area (info->user_vstart).  A
+ * zeroed table with one struct page pointer per mapped page is installed
+ * in vma->vm_private_data and the VMA is flagged VM_FOREIGN.
+ *
+ * Returns 0 on success, -EAGAIN if the requested size is wrong, and
+ * -ENOMEM if the private data or device is missing, the ring cannot be
+ * mapped, or the page table cannot be allocated.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       int size;
+       struct page **map;
+       private_info_t *prv;
+       tap_blkif_t *info;
+
+       /*Retrieve the dev info*/
+       prv = (private_info_t *)filp->private_data;
+       if (prv == NULL) {
+               WPRINTK("blktap: mmap, retrieving idx failed\n");
+               return -ENOMEM;
+       }
+       info = tapfds[prv->idx];
+       if (info == NULL) {
+               WPRINTK("blktap: mmap, no device for idx %d\n", prv->idx);
+               return -ENOMEM;
+       }
+
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &blktap_vm_ops;
+
+       size = vma->vm_end - vma->vm_start;
+       if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
+               WPRINTK("you _must_ map exactly %d pages!\n",
+                      mmap_pages + RING_PAGES);
+               return -EAGAIN;
+       }
+
+       info->rings_vstart = vma->vm_start;
+       info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
+
+       /* Map the ring pages to the start of the region and reserve it. */
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+       if (remap_pfn_range(vma, vma->vm_start,
+                           __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
+                           PAGE_SIZE, vma->vm_page_prot)) {
+               WPRINTK("Mapping user ring failed!\n");
+               goto fail;
+       }
+
+       /*
+        * Mark this VM as containing foreign pages, and set up mappings.
+        * kzalloc() returns zeroed memory, so every entry starts as NULL.
+        */
+       map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+                     * sizeof(*map),
+                     GFP_KERNEL);
+       if (map == NULL) {
+               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
+               goto fail;
+       }
+
+       vma->vm_private_data = map;
+       vma->vm_flags |= VM_FOREIGN;
+
+       info->vma = vma;
+       info->ring_ok = 1;
+       return 0;
+ fail:
+       /* Clear any active mappings. */
+       zap_page_range(vma, vma->vm_start,
+                      vma->vm_end - vma->vm_start, NULL);
+
+       return -ENOMEM;
+}
+
+
+/*
+ * ioctl entry point for the blktap character devices.
+ *
+ * The tap device index is the inode minor relative to BLKTAP_MINOR.
+ * SETMODE, PRINT_IDXS and SENDPID act on that device and are silently
+ * ignored (returning 0) unless the index is in 1..MAX_TAP_DEV-1 and the
+ * tapfds[] slot is populated.  FREEINTF and MINOR act on the device index
+ * passed in @arg instead.
+ *
+ * Returns a command-specific value, or -ENOIOCTLCMD for unknown commands.
+ */
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+       int idx = iminor(inode) - BLKTAP_MINOR;
+       tap_blkif_t *info = NULL;
+
+       /*
+        * Look the device up only after range-checking idx, so that
+        * tapfds[] is never indexed with an out-of-range minor.
+        */
+       if ( (idx > 0) && (idx < MAX_TAP_DEV) )
+               info = tapfds[idx];
+
+       switch(cmd) {
+       case BLKTAP_IOCTL_KICK_FE: 
+       {
+               /* There are fe messages to process. */
+               if ( (idx < 0) || (idx >= MAX_TAP_DEV) )
+                       return -EINVAL;
+               return blktap_read_ufe_ring(idx);
+       }
+       case BLKTAP_IOCTL_SETMODE:
+       {
+               if ( (info != NULL) && BLKTAP_MODE_VALID(arg) ) {
+                       info->mode = arg;
+                       /* XXX: may need to flush rings here. */
+                       DPRINTK("blktap: set mode to %lx\n", 
+                              arg);
+               }
+               /* Invalid modes and unknown devices are ignored. */
+               return 0;
+       }
+       case BLKTAP_IOCTL_PRINT_IDXS:
+       {
+               if (info != NULL) {
+                       printk("User Rings: \n-----------\n");
+                       printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
+                               "| req_prod: %2d, rsp_prod: %2d\n",
+                               info->ufe_ring.rsp_cons,
+                               info->ufe_ring.req_prod_pvt,
+                               info->ufe_ring.sring->req_prod,
+                               info->ufe_ring.sring->rsp_prod);
+               }
+               return 0;
+       }
+       case BLKTAP_IOCTL_SENDPID:
+       {
+               if (info != NULL) {
+                       info->pid = (pid_t)arg;
+                       DPRINTK("blktap: pid received %d\n", 
+                              info->pid);
+               }
+               return 0;
+       }
+       case BLKTAP_IOCTL_NEWINTF:
+       {               
+               /* @arg carries a packed domid_translate_t by value. */
+               uint64_t val = (uint64_t)arg;
+               domid_translate_t *tr = (domid_translate_t *)&val;
+               int newdev;
+
+               DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
+                      tr->domid, tr->busid);
+               newdev = get_next_free_dev();
+               if (newdev < 1) {
+                       WPRINTK("Error initialising /dev/xen/blktap - "
+                               "No more devices\n");
+                       return -1;
+               }
+               translate_domid[newdev].domid = tr->domid;
+               translate_domid[newdev].busid = tr->busid;
+               return newdev;
+       }
+       case BLKTAP_IOCTL_FREEINTF:
+       {
+               unsigned long dev = arg;
+               tap_blkif_t *target = NULL;
+
+               if ( (dev > 0) && (dev < MAX_TAP_DEV) ) target = tapfds[dev];
+
+               if ( (target != NULL) && (target->dev_pending) )
+                       target->dev_pending = 0;
+               return 0;
+       }
+       case BLKTAP_IOCTL_MINOR:
+       {
+               unsigned long dev = arg;
+               tap_blkif_t *target = NULL;
+               
+               if ( (dev > 0) && (dev < MAX_TAP_DEV) ) target = tapfds[dev];
+               
+               if (target != NULL) return target->minor;
+               else return -1;
+       }
+       case BLKTAP_IOCTL_MAJOR:
+               return BLKTAP_DEV_MAJOR;
+
+       case BLKTAP_QUERY_ALLOC_REQS:
+       {
+               WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
+                      alloc_pending_reqs, blkif_reqs);
+               /*
+                * Multiply before dividing: with integer arithmetic the
+                * former (a / b) * 100 was 0 for every a < b.
+                */
+               return (alloc_pending_reqs * 100) / blkif_reqs;
+       }
+       }
+       return -ENOIOCTLCMD;
+}
+
+/*
+ * poll() handler for a tap device.
+ *
+ * Registers the caller on the device's wait queue and, if requests have
+ * been queued privately on the user ring since the last push, publishes
+ * them and reports the device readable.
+ *
+ * Returns POLLIN | POLLRDNORM when new requests were pushed, otherwise 0
+ * (also for device index 0, a missing private_data, or an empty slot).
+ */
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
+{
+       private_info_t *prv;
+       tap_blkif_t *info;
+       
+       /*Retrieve the dev info*/
+       prv = (private_info_t *)file->private_data;
+       if (prv == NULL) {
+               WPRINTK(" poll, retrieving idx failed\n");
+               return 0;
+       }
+       
+       if (prv->idx == 0) return 0;
+       
+       info = tapfds[prv->idx];
+       /* The slot may be empty; blktap_kick_user() makes the same check. */
+       if (info == NULL) return 0;
+       
+       poll_wait(file, &info->wait, wait);
+       if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
+               flush_tlb_all();
+               /* Make the privately queued requests visible to userspace. */
+               RING_PUSH_REQUESTS(&info->ufe_ring);
+               return POLLIN | POLLRDNORM;
+       }
+       return 0;
+}
+
+/*
+ * Wake any task waiting (via blktap_poll) on tap device @idx.
+ * Index 0 and unpopulated tapfds[] slots are ignored.
+ */
+void blktap_kick_user(int idx)
+{
+       tap_blkif_t *info = (idx != 0) ? tapfds[idx] : NULL;
+
+       if (info == NULL)
+               return;
+
+       wake_up_interruptible(&info->wait);
+}
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+/* FIXME: Return ENOMEM properly on failure to allocate additional reqs. */
+/*
+ * Grow the pool of pending requests by one chunk.
+ *
+ * Under pending_free_lock: allocates the request array and the address
+ * table for slot mmap_alloc, obtains mmap_pages empty pages to back the
+ * grant mappings, records each page's kernel virtual address, puts the new
+ * requests on the pending_free list and increments mmap_alloc.
+ *
+ * Does nothing if mmap_alloc has reached MAX_PENDING_REQS, if a shrink is
+ * in progress (mmap_lock), or if the allocations fail (only logged).
+ */
+static void req_increase(void)
+{
+       int i, j;
+       struct page *page;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+
+       if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
+               goto done;
+
+       /*
+        * The spinlock is held with interrupts disabled, so these
+        * allocations must not sleep: use GFP_ATOMIC, not GFP_KERNEL.
+        */
+       pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t) *
+                                       blkif_reqs, GFP_ATOMIC);
+       pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) *
+                                       mmap_pages, GFP_ATOMIC);
+
+       if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) {
+               kfree(pending_reqs[mmap_alloc]);
+               kfree(pending_addrs[mmap_alloc]);
+               /* Do not leave dangling pointers in the unused slot. */
+               pending_reqs[mmap_alloc]  = NULL;
+               pending_addrs[mmap_alloc] = NULL;
+               WPRINTK("%s: out of memory\n", __FUNCTION__); 
+               goto done;
+       }
+
+       /*
+        * NOTE(review): the page-range allocators below are also called
+        * with the lock held - confirm that they cannot sleep.
+        */
+#ifdef __ia64__
+       extern unsigned long alloc_empty_foreign_map_page_range(
+               unsigned long pages);
+       mmap_start[mmap_alloc].start = (unsigned long)
+               alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+       page = balloon_alloc_empty_page_range(mmap_pages);
+       BUG_ON(page == NULL);
+
+       /* Pin all of the pages. */
+       for (i=0; i<mmap_pages; i++)
+               get_page(&page[i]);
+
+       mmap_start[mmap_alloc].start = 
+               (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+       mmap_start[mmap_alloc].mpage = page;
+
+#endif
+       DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+               __FUNCTION__, blkif_reqs, mmap_pages, 
+              mmap_start[mmap_alloc].start);
+
+       BUG_ON(mmap_start[mmap_alloc].start == 0);
+
+       /* Kernel virtual address of every page in this chunk. */
+       for (i = 0; i < mmap_pages; i++) 
+               pending_addrs[mmap_alloc][i] = 
+                       mmap_start[mmap_alloc].start + (i << PAGE_SHIFT);
+
+       /*
+        * NOTE(review): the array was sized with blkif_reqs but is walked
+        * up to MAX_PENDING_REQS - confirm blkif_reqs >= MAX_PENDING_REQS.
+        */
+       for (i = 0; i < MAX_PENDING_REQS ; i++) {
+               list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
+                             &pending_free);
+               pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
+               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
+                                                                i, j));
+       }
+
+       mmap_alloc++;
+       DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
+ done:
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+
+}
+
+/*
+ * Release pool chunk @mmap: free its request array and address table,
+ * unpin and return its pages, clear mmap_lock and decrement mmap_alloc.
+ *
+ * The caller must already hold pending_free_lock.
+ */
+static void mmap_req_del(int mmap)
+{
+       int i;
+       struct page *page;
+
+       /*Spinlock already acquired*/
+       kfree(pending_reqs[mmap]);
+       kfree(pending_addrs[mmap]);
+       /* Clear the slots so stale pointers cannot be reused. */
+       pending_reqs[mmap]  = NULL;
+       pending_addrs[mmap] = NULL;
+
+#ifdef __ia64__
+       /*Not sure what goes here yet!*/
+#else
+
+       /* Unpin all of the pages. */
+       page = mmap_start[mmap].mpage;
+       for (i=0; i<mmap_pages; i++)
+               put_page(&page[i]);
+
+       balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages);
+#endif
+
+       mmap_lock = 0;
+       /* Decrement first so the message reports the new count. */
+       mmap_alloc--;
+       DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
+}
+
+/*N.B. Currently unused - will be accessed via sysfs*/
+/*
+ * Start retiring the most recently added pool chunk.
+ *
+ * Sets mmap_lock, removes every idle request of the last chunk from the
+ * free list and counts those still in use in mmap_inuse.  The chunk is
+ * released at once if none are in use; otherwise free_req() releases it
+ * when the last outstanding request comes back.
+ *
+ * Does nothing while another shrink is pending or when no more than one
+ * chunk is allocated.
+ */
+static void req_decrease(void)
+{
+       pending_req_t *req;
+       int i;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+
+       DPRINTK("Req decrease called.\n");
+       /*
+        * "<= 1" rather than "== 1": with no chunk allocated the code
+        * below would index pending_reqs[-1].
+        */
+       if (mmap_lock || mmap_alloc <= 1) 
+               goto done;
+
+       mmap_lock = 1;
+       mmap_inuse = MAX_PENDING_REQS;
+       
+        /*Go through reqs and remove any that aren't in use*/
+       for (i = 0; i < MAX_PENDING_REQS ; i++) {
+               req = &pending_reqs[mmap_alloc-1][i];
+               if (req->inuse == 0) {
+                       list_del(&req->free_list);
+                       mmap_inuse--;
+               }
+       }
+       if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
+ done:
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+       return;
+}
+
+/*
+ * Take the first request off the pending_free list, mark it in use and
+ * bump alloc_pending_reqs.  Returns NULL when the list is empty.
+ */
+static pending_req_t* alloc_req(void)
+{
+       pending_req_t *req;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+
+       if (list_empty(&pending_free)) {
+               req = NULL;
+       } else {
+               req = list_entry(pending_free.next, pending_req_t, free_list);
+               list_del(&req->free_list);
+               req->inuse = 1;
+               alloc_pending_reqs++;
+       }
+
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+
+       return req;
+}
+
+/*
+ * Return @req to the pool.
+ *
+ * A request belonging to the chunk being retired (mmap_lock set and
+ * mem_idx naming the last chunk) is not put back on the free list; it
+ * only decrements mmap_inuse, and the chunk is released once that count
+ * reaches zero.  Any other request goes back on pending_free, and waiters
+ * on pending_free_wq are woken if the list was empty beforehand.
+ */
+static void free_req(pending_req_t *req)
+{
+       unsigned long flags;
+       int notify = 0;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+
+       alloc_pending_reqs--;
+       req->inuse = 0;
+
+       if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
+               mmap_inuse--;
+               if (mmap_inuse == 0)
+                       mmap_req_del(mmap_alloc-1);
+       } else {
+               notify = list_empty(&pending_free);
+               list_add(&req->free_list, &pending_free);
+       }
+
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+
+       if (notify)
+               wake_up(&pending_free_wq);
+}
+
+/*
+ * Tear down the grant mappings of one pending request.
+ *
+ * For each of the request's nr_pages segments that still has a valid
+ * handle pair, queue one unmap op for the kernel-side mapping and one for
+ * the user-side PTE, then invalidate the handle pair.  All queued ops are
+ * issued in a single GNTTABOP_unmap_grant_ref hypercall, after which the
+ * request's user address range is zapped if the device has a vma.
+ *
+ * @req:    pending request being flushed
+ * @k_idx:  request index used to derive kernel addresses and handles
+ * @u_idx:  request index used to derive user addresses
+ * @tapidx: index of the tap device in tapfds[]
+ */
+static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int 
+                           tapidx)
+{
+       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+       unsigned int i, invcount = 0;
+       struct grant_handle_pair *khandle;
+       uint64_t ptep;
+       int ret, mmap_idx;
+       unsigned long kvaddr, uvaddr;
+
+       tap_blkif_t *info = tapfds[tapidx];
+       
+       if (info == NULL) {
+               WPRINTK("fast_flush: Couldn't get info!\n");
+               return;
+       }
+       mmap_idx = req->mem_idx;
+
+       for (i = 0; i < req->nr_pages; i++) {
+               /* NOTE(review): kvaddr/uvaddr are computed but the ops
+                * below recompute the same MMAP_VADDR expressions. */
+               kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i);
+               uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
+
+               khandle = &pending_handle(mmap_idx, k_idx, i);
+               if (BLKTAP_INVALID_HANDLE(khandle)) {
+                       WPRINTK("BLKTAP_INVALID_HANDLE\n");
+                       continue;
+               }
+               /* Kernel-side mapping, addressed by virtual address. */
+               gnttab_set_unmap_op(&unmap[invcount], 
+                       MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), 
+                                   GNTMAP_host_map, khandle->kernel);
+               invcount++;
+
+               /* NOTE(review): info->vma is dereferenced here but only
+                * NULL-checked after the hypercall below.  The early return
+                * also skips the hypercall, so ops queued so far are never
+                * issued. */
+               if (create_lookup_pte_addr(
+                   info->vma->vm_mm,
+                   MMAP_VADDR(info->user_vstart, u_idx, i), 
+                   &ptep) !=0) {
+                       WPRINTK("Couldn't get a pte addr!\n");
+                       return;
+               }
+
+               /* User-side mapping, addressed by the PTE found above. */
+               gnttab_set_unmap_op(&unmap[invcount], 
+                       ptep, GNTMAP_host_map,
+                       khandle->user);
+               invcount++;
+            
+               BLKTAP_INVALIDATE_HANDLE(khandle);
+       }
+       /* One hypercall for everything queued; any failure is fatal. */
+       ret = HYPERVISOR_grant_table_op(
+               GNTTABOP_unmap_grant_ref, unmap, invcount);
+       BUG_ON(ret);
+       
+       if (info->vma != NULL)
+               zap_page_range(info->vma, 
+                              MMAP_VADDR(info->user_vstart, u_idx, 0), 
+                              req->nr_pages << PAGE_SHIFT, NULL);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+/*
+ * Log the out-of-request, read and write counters of @blkif, schedule the
+ * next report ten seconds from now and reset the counters.
+ */
+static void print_stats(blkif_t *blkif)
+{
+       unsigned long interval;
+
+       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+              current->comm, blkif->st_oo_req,
+              blkif->st_rd_req, blkif->st_wr_req);
+
+       interval = msecs_to_jiffies(10 * 1000);
+       blkif->st_print = jiffies + interval;
+
+       blkif->st_oo_req = 0;
+       blkif->st_wr_req = 0;
+       blkif->st_rd_req = 0;
+}
+
+/*
+ * Per-interface worker loop; runs until kthread_should_stop() is true.
+ *
+ * @arg is the blkif_t to service.  A reference on it is held for the
+ * lifetime of the loop.  Each iteration sleeps until the interface has
+ * been flagged as having requests and the pending_free list is non-empty,
+ * then runs do_block_io_op().  Always returns 0.
+ */
+int tap_blkif_schedule(void *arg)
+{
+       blkif_t *blkif = arg;
+
+       blkif_get(blkif);
+
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: started\n", current->comm);
+
+       while (!kthread_should_stop()) {
+               /* Wait for work to be signalled on this interface ... */
+               wait_event_interruptible(
+                       blkif->wq,
+                       blkif->waiting_reqs || kthread_should_stop());
+               /* ... and for a free pending_req to carry it. */
+               wait_event_interruptible(
+                       pending_free_wq,
+                       !list_empty(&pending_free) || kthread_should_stop());
+
+               blkif->waiting_reqs = 0;
+               smp_mb(); /* clear flag *before* checking for work */
+
+               /* Non-zero means work is left over: go round again. */
+               if (do_block_io_op(blkif))
+                       blkif->waiting_reqs = 1;
+
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+       }
+
+       if (log_stats)
+               print_stats(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+       /* Clear the thread pointer, then drop the reference taken above. */
+       blkif->xenblkd = NULL;
+       blkif_put(blkif);
+
+       return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called by user level ioctl()
+ */
+
+/*
+ * Consume the responses userspace has placed on the user ring of tap
+ * device @idx.
+ *
+ * Each response id is translated back to its [mmap_idx, pending_idx] pair
+ * through info->idx_map.  For a valid pair the request's pages are removed
+ * from the vma's page map, its grant mappings are flushed, the response is
+ * forwarded to the frontend ring and the pending request is released.
+ *
+ * Always returns 0; nothing is done when the device slot is empty or the
+ * device is not in BLKTAP_MODE_INTERCEPT_FE mode.
+ */
+static int blktap_read_ufe_ring(int idx)
+{
+       /* This is called to read responses from the UFE ring. */
+       RING_IDX i, j, rp;
+       blkif_response_t *resp;
+       blkif_t *blkif=NULL;
+       int pending_idx, usr_idx, mmap_idx;
+       pending_req_t *pending_req;
+       tap_blkif_t *info;
+       
+       info = tapfds[idx];
+       if (info == NULL) {
+               return 0;
+       }
+
+       /* We currently only forward packets in INTERCEPT_FE mode. */
+       if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
+               return 0;
+
+       /* for each outstanding message on the UFEring  */
+       rp = info->ufe_ring.sring->rsp_prod;
+       rmb();
+        
+       for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
+               resp = RING_GET_RESPONSE(&info->ufe_ring, i);
+               ++info->ufe_ring.rsp_cons;
+
+               /*
+                * retrieve [usr_idx] to [mmap_idx,pending_idx] mapping
+                *
+                * NOTE(review): usr_idx comes straight from the response
+                * written by userspace and indexes idx_map[] unchecked -
+                * confirm the size of idx_map and bound it here.
+                */
+               usr_idx = (int)resp->id;
+               pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+               mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+               if ( (mmap_idx >= mmap_alloc) || 
+                  (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) {
+                       WPRINTK("Incorrect req map"
+                              "[%d], internal map [%d,%d (%d)]\n", 
+                              usr_idx, mmap_idx, 
+                              ID_TO_IDX(info->idx_map[usr_idx]),
+                              MASK_PEND_IDX(
+                                      ID_TO_IDX(info->idx_map[usr_idx])));
+                       /*
+                        * The pair does not name a live pending request;
+                        * using it would index pending_reqs[] out of
+                        * bounds or complete the wrong request.  Drop the
+                        * response.
+                        */
+                       continue;
+               }
+
+               pending_req = &pending_reqs[mmap_idx][pending_idx];
+               blkif = pending_req->blkif;
+
+               /* Undo the reserved marking and the vma page-map entries. */
+               for (j = 0; j < pending_req->nr_pages; j++) {
+
+                       unsigned long kvaddr, uvaddr;
+                       struct page **map = info->vma->vm_private_data;
+                       struct page *pg;
+                       int offset; 
+
+                       uvaddr  = MMAP_VADDR(info->user_vstart, usr_idx, j);
+                       kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
+                                           pending_idx, j);
+
+                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+                       ClearPageReserved(pg);
+                       offset = (uvaddr - info->vma->vm_start) 
+                               >> PAGE_SHIFT;
+                       map[offset] = NULL;
+               }
+               fast_flush_area(pending_req, pending_idx, usr_idx, idx); 
+               make_response(blkif, pending_req->id, resp->operation,
+                             resp->status);
+               info->idx_map[usr_idx] = INVALID_REQ;
+               /* Drop the reference taken when the request was queued. */
+               blkif_put(pending_req->blkif);
+               free_req(pending_req);
+       }
+               
+       return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+/*
+ * Flag @blkif as having requests to process and wake whoever sleeps on
+ * its wait queue (tap_blkif_schedule waits on both).  The flag is set
+ * before the wake-up so that the woken waiter sees it.
+ */
+static void blkif_notify_work(blkif_t *blkif)
+{
+       blkif->waiting_reqs = 1;
+       wake_up(&blkif->wq);
+}
+
+/*
+ * Interrupt handler: @dev_id is the blkif_t the interrupt belongs to.
+ * Only flags the interface for work; always reports IRQ_HANDLED.
+ */
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+       blkif_t *blkif = dev_id;
+
+       blkif_notify_work(blkif);
+
+       return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+static int print_dbug = 1;
+/*
+ * Pull requests off the frontend ring of @blkif and dispatch them.
+ *
+ * Read and write requests go to dispatch_rw_block_io(); any other
+ * operation is answered with BLKIF_RSP_ERROR.  Processing stops early if
+ * the user ring is full, the request consumer would overflow, or no
+ * pending_req is available.  The tap device's waiters are kicked at the
+ * end.
+ *
+ * Returns non-zero if requests remain to be processed (including when no
+ * usable tap device is attached), otherwise 0.
+ */
+static int do_block_io_op(blkif_t *blkif)
+{
+       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+       blkif_request_t *req;
+       pending_req_t *pending_req;
+       RING_IDX rc, rp;
+       int more_to_do = 0;
+       tap_blkif_t *info;
+
+       rc = blk_ring->req_cons;
+       rp = blk_ring->sring->req_prod;
+       rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+       /*Check blkif has corresponding UE ring*/
+       if (blkif->dev_num == -1) {
+               /*oops*/
+               if (print_dbug) {
+                       WPRINTK("Corresponding UE " 
+                              "ring does not exist!\n");
+                       print_dbug = 0; /*We only print this message once*/
+               }
+               /* NOTE(review): returning 1 makes tap_blkif_schedule set
+                * waiting_reqs again at once - confirm this cannot spin. */
+               return 1; 
+       }
+
+       info = tapfds[blkif->dev_num];
+       if (info == NULL || !info->dev_inuse) {
+               if (print_dbug) {
+                       WPRINTK("Can't get UE info!\n");
+                       print_dbug = 0;
+               }
+               return 1;
+       }
+
+       while (rc != rp) {
+               
+               if (RING_FULL(&info->ufe_ring)) {
+                       WPRINTK("RING_FULL! More to do\n");
+                       more_to_do = 1;
+                       break;
+               }
+               
+               if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
+                       WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
+                              " More to do\n");
+                       more_to_do = 1;
+                       break;          
+               }
+
+               pending_req = alloc_req();
+               if (NULL == pending_req) {
+                       /* Counted as an out-of-requests event in the stats. */
+                       blkif->st_oo_req++;
+                       more_to_do = 1;
+                       break;
+               }
+
+               req = RING_GET_REQUEST(blk_ring, rc);
+               blk_ring->req_cons = ++rc; /* before make_response() */ 
+
+               switch (req->operation) {
+               case BLKIF_OP_READ:
+                       blkif->st_rd_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
+                       break;
+
+               case BLKIF_OP_WRITE:
+                       blkif->st_wr_req++;
+                       dispatch_rw_block_io(blkif, req, pending_req);
+                       break;
+
+               default:
+                       WPRINTK("unknown operation [%d]\n",
+                               req->operation);
+                       make_response(blkif, req->id, req->operation,
+                                     BLKIF_RSP_ERROR);
+                       free_req(pending_req);
+                       break;
+               }
+       }
+               
+       /* Wake any poller so it can push the queued requests. */
+       blktap_kick_user(blkif->dev_num);
+
+       return more_to_do;
+}
+
+/*
+ * Forward one frontend read/write request to the userspace tap process.
+ *
+ * Every segment of @req is grant-mapped twice: once at the kernel address
+ * derived from @pending_req and once into the tap process through the PTE
+ * of the matching user address.  On success the handles are recorded, the
+ * pages are entered in the vma's page map and marked reserved, the
+ * [mmap_idx, pending_idx] pair is stored under a free user index, and a
+ * copy of the request (with id replaced by that user index) is queued
+ * privately on the user ring.
+ *
+ * On any failure an error response is sent to the frontend and
+ * @pending_req is released.
+ *
+ * @blkif:       interface the request arrived on
+ * @req:         request, still located in the frontend ring
+ * @pending_req: pending request taken from the pool by the caller
+ */
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req)
+{
+       int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+       unsigned int nseg;
+       int ret, i;
+       tap_blkif_t *info = tapfds[blkif->dev_num];
+       uint64_t sector;
+       
+       blkif_request_t *target;
+       int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
+       int usr_idx = GET_NEXT_REQ(info->idx_map);
+       uint16_t mmap_idx = pending_req->mem_idx;
+
+       /*
+        * Check we have space on user ring - should never fail.  Nothing
+        * has been mapped yet, usr_idx names no slot and nr_pages is stale,
+        * so there is nothing to flush: only report the error.
+        */
+       if(usr_idx == INVALID_REQ) goto fail_response;
+       
+       /* Check that number of segments is sane. */
+       nseg = req->nr_segments;
+       if ( unlikely(nseg == 0) || 
+           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
+               WPRINTK("Bad number of segments in request (%d)\n", nseg);
+               goto fail_response;
+       }
+       
+       /* Make sure userspace is ready. */
+       if (!info->ring_ok) {
+               WPRINTK("blktap: ring not ready for requests!\n");
+               goto fail_response;
+       }
+
+       if (RING_FULL(&info->ufe_ring)) {
+               WPRINTK("blktap: fe_ring is full, can't add "
+                       "IO Request will be dropped. %d %d\n",
+                       RING_SIZE(&info->ufe_ring),
+                       RING_SIZE(&blkif->blk_ring));
+               goto fail_response;
+       }
+
+       pending_req->blkif     = blkif;
+       pending_req->id        = req->id;
+       pending_req->operation = operation;
+       pending_req->status    = BLKIF_RSP_OKAY;
+       pending_req->nr_pages  = nseg;
+       op = 0;
+       /* Queue two map ops per segment: kernel first, then user. */
+       for (i = 0; i < nseg; i++) {
+               unsigned long uvaddr;
+               unsigned long kvaddr;
+               uint64_t ptep;
+               uint32_t flags;
+
+               uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+               kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
+                                   pending_idx, i);
+
+               /* Out-of-range sectors are only reported, not rejected. */
+               sector = req->sector_number + (8*i);
+               if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
+                       WPRINTK("BLKTAP: Sector request greater "
+                              "than size\n");
+                       WPRINTK("BLKTAP: %s request sector "
+                              "[%llu,%llu], Total [%llu]\n",
+                              (req->operation == 
+                               BLKIF_OP_WRITE ? "WRITE" : "READ"),
+                               (long long unsigned) sector,
+                               (long long unsigned) sector>>9,
+                               (long long unsigned) blkif->sectors);
+               }
+
+               flags = GNTMAP_host_map;
+               if (operation == WRITE)
+                       flags |= GNTMAP_readonly;
+               gnttab_set_map_op(&map[op], kvaddr, flags,
+                                 req->seg[i].gref, blkif->domid);
+               op++;
+
+               /* Now map it to user. */
+               ret = create_lookup_pte_addr(info->vma->vm_mm, 
+                                            uvaddr, &ptep);
+               if (ret) {
+                       WPRINTK("Couldn't get a pte addr!\n");
+                       /* fail_flush performs the one and only flush. */
+                       goto fail_flush;
+               }
+
+               flags = GNTMAP_host_map | GNTMAP_application_map
+                       | GNTMAP_contains_pte;
+               if (operation == WRITE)
+                       flags |= GNTMAP_readonly;
+               gnttab_set_map_op(&map[op], ptep, flags,
+                                 req->seg[i].gref, blkif->domid);
+               op++;
+       }
+
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
+       BUG_ON(ret);
+
+       /* map[i] is the kernel op and map[i+1] the user op of segment i/2. */
+       for (i = 0; i < (nseg*2); i+=2) {
+               unsigned long uvaddr;
+               unsigned long kvaddr;
+               unsigned long offset;
+               struct page *pg;
+
+               uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
+               kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
+                                   pending_idx, i/2);
+
+               /*
+                * NOTE(review): on failure the handles of later segments
+                * that did map are never recorded, so fail_flush cannot
+                * unmap them.
+                */
+               if (unlikely(map[i].status != 0)) {
+                       WPRINTK("invalid kernel buffer -- "
+                               "could not remap it\n");
+                       goto fail_flush;
+               }
+
+               if (unlikely(map[i+1].status != 0)) {
+                       WPRINTK("invalid user buffer -- "
+                               "could not remap it\n");
+                       goto fail_flush;
+               }
+
+               pending_handle(mmap_idx, pending_idx, i/2).kernel 
+                       = map[i].handle;
+               pending_handle(mmap_idx, pending_idx, i/2).user   
+                       = map[i+1].handle;
+#ifdef CONFIG_XEN_IA64_DOM0_NON_VP
+               pending_addrs[mmap_idx][vaddr_pagenr(pending_req, i)] =
+                       (unsigned long)gnttab_map_vaddr(map[i]);
+#else
+               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+#endif
+               offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+               pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+               ((struct page **)info->vma->vm_private_data)[offset] =
+                       pg;
+       }
+       /*
+        * Mark mapped pages as reserved.  Use the validated nseg: @req
+        * still points into the ring shared with the frontend, so
+        * req->nr_segments must not be read a second time.
+        */
+       for (i = 0; i < nseg; i++) {
+               unsigned long kvaddr;
+               struct page *pg;
+
+               kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
+                                   pending_idx, i);
+               pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+               SetPageReserved(pg);
+       }
+       
+       /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+       info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
+
+       /* Reference is dropped when the response is read back. */
+       blkif_get(blkif);
+       /* Finally, write the request message to the user ring. */
+       target = RING_GET_REQUEST(&info->ufe_ring,
+                                 info->ufe_ring.req_prod_pvt);
+       memcpy(target, req, sizeof(*req));
+       target->id = usr_idx;
+       info->ufe_ring.req_prod_pvt++;
+       return;
+
+ fail_flush:
+       WPRINTK("Reached Fail_flush\n");
+       fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
+ fail_response:
+       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+       free_req(pending_req);
+} 
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+/*
+ * Place a response on the frontend ring of @blkif and notify as needed.
+ *
+ * @id: request id echoed back to the frontend
+ * @op: operation code echoed back
+ * @st: status to report
+ *
+ * The ring is updated under blk_ring_lock.  After the lock is dropped the
+ * interface is flagged for more work if unconsumed requests were seen,
+ * and the frontend is notified through the interface irq if the ring
+ * macros ask for a notification.
+ */
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st)
+{
+       blkif_response_t *resp;
+       unsigned long     flags;
+       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+       int more_to_do = 0;
+       int notify;
+
+       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+       /* Place on the response ring for the relevant domain. */ 
+       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
+       resp->id        = id;
+       resp->operation = op;
+       resp->status    = st;
+       blk_ring->rsp_prod_pvt++;
+       /* Publishes the response and sets 'notify'. */
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
+
+       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
+               /*
+                * Tail check for pending requests. Allows frontend to avoid
+                * notifications if requests are already in flight (lower
+                * overheads and promotes batching).
+                */
+               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
+       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+               more_to_do = 1;
+
+       }       
+       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+       /* Wake-ups and notifications happen outside the ring lock. */
+       if (more_to_do)
+               blkif_notify_work(blkif);
+       if (notify)
+               notify_remote_via_irq(blkif->irq);
+}
+
+/*
+ * Module initialisation.
+ *
+ * Sets up the pending-request pool (two chunks), the interface and xenbus
+ * layers, registers the "blktap" character device major and creates one
+ * tap_blkif_t plus one /dev/xen/blktap<N> node for each of the
+ * MAX_TAP_DEV minors.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, and -ENOMEM for
+ * every other failure.
+ *
+ * NOTE(review): none of the failure paths undo earlier steps (request
+ * pool, chrdev registration, devfs entries, tapfds[] allocations), and
+ * the result of req_increase() cannot be checked because it returns void.
+ */
+static int __init blkif_init(void)
+{
+       int i,ret,blktap_dir;
+       tap_blkif_t *info;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       INIT_LIST_HEAD(&pending_free);
+        for(i = 0; i < 2; i++) req_increase();
+
+       tap_blkif_interface_init();
+
+       alloc_pending_reqs = 0;
+
+       tap_blkif_xenbus_init();
+
+       /*Create the blktap devices, but do not map memory or waitqueue*/
+       /* 0xFFFF marks a translate_domid[] entry as unassigned here. */
+       for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF;
+
+       ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops);
+       blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
+
+       if ( (ret < 0)||(blktap_dir < 0) ) {
+               WPRINTK("Couldn't register /dev/xen/blktap\n");
+               return -ENOMEM;
+       }       
+       
+       /* Minor 0 gets a node too; the per-device ioctls, poll and kick
+        * paths in this file all ignore index 0. */
+       for(i = 0; i < MAX_TAP_DEV; i++ ) {
+               info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL);
+               if(tapfds[i] == NULL) return -ENOMEM;
+               info->minor = i;
+               info->pid = 0;
+               info->blkif = NULL;
+
+               ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i),
+                       S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
+
+               if(ret != 0) return -ENOMEM;
+               info->dev_pending = info->dev_inuse = 0;
+
+               DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
+       }
+       
+       DPRINTK("Blktap device successfully created\n");
+
+       return 0;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/common.h b/linux-2.6-xen-sparse/drivers/xen/blktap/common.h
new file mode 100644 (file)
index 0000000..975674f
--- /dev/null
@@ -0,0 +1,120 @@
+/* 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/ring.h>
+#include <xen/gnttab.h>
+#include <xen/driver_util.h>
+
+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
+                                    __FILE__ , __LINE__ , ## _a )
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+struct backend_info; 
+
+typedef struct blkif_st {      /* state of one backend block interface */
+       /* Unique identifier for this interface. */
+       domid_t           domid;
+       unsigned int      handle;
+       /* Physical parameters of the comms window. */
+       unsigned int      evtchn;
+       unsigned int      irq;      /* passed to notify_remote_via_irq() */
+       /* Comms information. */
+       blkif_back_ring_t blk_ring; /* request/response ring with frontend */
+       struct vm_struct *blk_ring_area;
+       /* Back pointer to the backend_info. */
+       struct backend_info *be; 
+       /* Private fields. */
+       spinlock_t       blk_ring_lock; /* held while writing responses */
+       atomic_t         refcnt;        /* see blkif_get() / blkif_put() */
+
+       wait_queue_head_t   wq;           /* tap_blkif_schedule() sleeps here */
+       struct task_struct  *xenblkd;     /* cleared when the thread exits */
+       unsigned int        waiting_reqs; /* set by blkif_notify_work() */
+       request_queue_t     *plug;
+
+       /* statistics: counters are reset on every print_stats() */
+       unsigned long       st_print;  /* jiffies of the next report */
+       int                 st_rd_req; /* read requests seen */
+       int                 st_wr_req; /* write requests seen */
+       int                 st_oo_req; /* times no pending_req was free */
+
+       wait_queue_head_t waiting_to_free; /* woken when refcnt hits zero */
+
+       grant_handle_t shmem_handle;
+       grant_ref_t    shmem_ref;
+       
+       int             dev_num; /* index into tapfds[]; -1 if none */
+       uint64_t        sectors; /* request bound, checked only if > 0 */
+} blkif_t;
+
+blkif_t *tap_alloc_blkif(domid_t domid);
+void tap_blkif_free(blkif_t *blkif);
+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, 
+                 unsigned int evtchn);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) /* take a reference */
+#define blkif_put(_b)                                  \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       wake_up(&(_b)->waiting_to_free);\
+       } while (0) /* drop a reference; wake waiters when it reaches zero */
+
+
+struct phys_req { /* NOTE(review): no user visible in this patch excerpt - confirm it is needed */
+       unsigned short       dev;
+       unsigned short       nr_sects;
+       struct block_device *bdev;
+       blkif_sector_t       sector_number;
+};
+
+void tap_blkif_interface_init(void);
+
+void tap_blkif_xenbus_init(void);
+
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int tap_blkif_schedule(void *arg);
+
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
+void signal_tapdisk(int idx);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c b/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c
new file mode 100644 (file)
index 0000000..94727b1
--- /dev/null
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * drivers/xen/blktap/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+
+static kmem_cache_t *blkif_cachep;
+
+/*
+ * Allocate and initialise a blkif_t for frontend domain @domid.
+ *
+ * The object comes from blkif_cachep, is zero-filled, and starts life with
+ * a reference count of 1.  Returns the new object, or ERR_PTR(-ENOMEM) if
+ * the allocation fails.
+ */
+blkif_t *tap_alloc_blkif(domid_t domid)
+{
+       blkif_t *new_if = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+
+       if (new_if == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /* Every field not set explicitly below starts out as zero. */
+       memset(new_if, 0, sizeof(blkif_t));
+
+       new_if->domid    = domid;
+       new_if->st_print = jiffies;
+       atomic_set(&new_if->refcnt, 1);
+
+       spin_lock_init(&new_if->blk_ring_lock);
+       init_waitqueue_head(&new_if->wq);
+       init_waitqueue_head(&new_if->waiting_to_free);
+
+       return new_if;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+       struct gnttab_map_grant_ref op;
+       int ret;
+
+       gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+                         GNTMAP_host_map, shared_page, blkif->domid);
+
+       lock_vm_area(blkif->blk_ring_area);
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
+       unlock_vm_area(blkif->blk_ring_area);
+       BUG_ON(ret);
+
+       if (op.status) {
+               DPRINTK(" Grant table operation failure !\n");
+               return op.status;
+       }
+
+       blkif->shmem_ref = shared_page;
+       blkif->shmem_handle = op.handle;
+
+#ifdef CONFIG_XEN_IA64_DOM0_NON_VP
+       /* on some arch's, map_grant_ref behaves like mmap, in that the
+        * passed address is a hint and a different address may be returned */
+       blkif->blk_ring_area->addr = gnttab_map_vaddr(op);
+#endif
+
+       return 0;
+}
+
+/*
+ * Undo map_frontend_page(): release the grant mapping identified by
+ * blkif->shmem_handle at the address held in blkif->blk_ring_area.
+ * A failure of the hypercall itself triggers BUG_ON().
+ */
+static void unmap_frontend_page(blkif_t *blkif)
+{
+       struct gnttab_unmap_grant_ref unmap_op;
+       int rc;
+
+       gnttab_set_unmap_op(&unmap_op,
+                           (unsigned long)blkif->blk_ring_area->addr,
+                           GNTMAP_host_map, blkif->shmem_handle);
+
+       lock_vm_area(blkif->blk_ring_area);
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                      &unmap_op, 1);
+       unlock_vm_area(blkif->blk_ring_area);
+       BUG_ON(rc);
+}
+
+/*
+ * Connect @blkif to its frontend: reserve a page of kernel address space,
+ * map the frontend's ring page (grant reference @shared_page) into it, bind
+ * to the frontend's event channel @evtchn and install tap_blkif_be_int as
+ * the interrupt handler.
+ *
+ * Returns 0 on success (or if an irq is already bound), -ENOMEM if no
+ * address space could be reserved, or the error reported by the failing
+ * step.  On failure everything set up so far is undone and @blkif is left
+ * unconnected (irq == 0, no ring), so a later tap_blkif_free() will not
+ * try to unmap or unbind anything.
+ */
+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, 
+                 unsigned int evtchn)
+{
+       blkif_sring_t *sring;
+       int err;
+       int irq;
+       struct evtchn_bind_interdomain bind_interdomain;
+
+       /* Already connected through? */
+       if (blkif->irq)
+               return 0;
+
+       if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+               return -ENOMEM;
+
+       err = map_frontend_page(blkif, shared_page);
+       if (err)
+               goto fail_free_area;
+
+       bind_interdomain.remote_dom  = blkif->domid;
+       bind_interdomain.remote_port = evtchn;
+
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+                                         &bind_interdomain);
+       if (err)
+               goto fail_unmap;
+
+       blkif->evtchn = bind_interdomain.local_port;
+
+       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
+       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
+
+       /*
+        * Keep the result in a signed local: a negative value is an error
+        * and must not be stored in blkif->irq, where it would be taken
+        * for a bound irq by the "already connected" test above and by
+        * tap_blkif_free().
+        * NOTE(review): assumes bind_evtchn_to_irqhandler() returns a
+        * negative errno on failure and releases the port itself -- confirm.
+        */
+       irq = bind_evtchn_to_irqhandler(
+               blkif->evtchn, tap_blkif_be_int, 0, "blkif-backend", blkif);
+       if (irq < 0) {
+               err = irq;
+               /* tap_blkif_free() keys its unmap on blk_ring.sring. */
+               blkif->blk_ring.sring = NULL;
+               goto fail_unmap;
+       }
+       blkif->irq = irq;
+
+       return 0;
+
+fail_unmap:
+       unmap_frontend_page(blkif);
+fail_free_area:
+       free_vm_area(blkif->blk_ring_area);
+       blkif->blk_ring_area = NULL;
+       return err;
+}
+
+/*
+ * Tear down and free @blkif.
+ *
+ * Drops one reference, sleeps on waiting_to_free until the count has
+ * reached zero, then unbinds the irq (if one is bound), unmaps and frees
+ * the ring area (if a ring was set up) and returns the object to
+ * blkif_cachep.
+ */
+void tap_blkif_free(blkif_t *blkif)
+{
+       atomic_dec(&blkif->refcnt);
+       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+
+       /* Only unbind if an irq was actually bound. */
+       if (blkif->irq)
+               unbind_from_irqhandler(blkif->irq, blkif);
+
+       /* A non-NULL sring means the frontend page is still mapped. */
+       if (blkif->blk_ring.sring) {
+               unmap_frontend_page(blkif);
+               free_vm_area(blkif->blk_ring_area);
+       }
+
+       kmem_cache_free(blkif_cachep, blkif);
+}
+
+/*
+ * Create the slab cache that tap_alloc_blkif() allocates blkif_t from.
+ * NOTE(review): the result of kmem_cache_create() is not checked, so a
+ * failure here leaves blkif_cachep NULL -- confirm this is acceptable.
+ */
+void __init tap_blkif_interface_init(void)
+{
+       blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t), 
+                                        0, 0, NULL, NULL);
+}
diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c
new file mode 100644 (file)
index 0000000..b1ad47b
--- /dev/null
@@ -0,0 +1,354 @@
+/* drivers/xen/blktap/xenbus.c
+ *
+ * Xenbus code for blktap
+ *
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Based on the blkback xenbus code:
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+
+/* Per-device backend state, stored in dev->dev.driver_data. */
+struct backend_info
+{
+       struct xenbus_device *dev;          /* owning xenbus device */
+       blkif_t *blkif;                     /* interface; points back via ->be */
+       struct xenbus_watch backend_watch;  /* watch on <nodename>/info */
+       int xenbus_id;                      /* id parsed from dev->nodename */
+};
+
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static int blktap_remove(struct xenbus_device *dev);
+static int blktap_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id);
+static void tap_backend_changed(struct xenbus_watch *, const char **,
+                           unsigned int);
+static void tap_frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state);
+
+/*
+ * Return the index in @str of the occurrence of @c that is preceded by
+ * exactly @len earlier occurrences (so @len == 0 finds the first one).
+ * If the string ends after exactly @len occurrences, the string length is
+ * returned instead; if it ends with fewer, -ERANGE is returned.
+ */
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+        unsigned int pos = 0;
+
+        while (str[pos] != '\0') {
+                /* Each separator either is the one wanted or uses up
+                 * one of the separators still to be skipped. */
+                if (str[pos] == c && len-- == 0)
+                        return pos;
+                pos++;
+        }
+
+        if (len != 0)
+                return -ERANGE;
+        return pos;
+}
+
+/*
+ * Extract the numeric id that follows the third '/' in @str and return it
+ * parsed as a base-10 number.
+ *
+ * Returns -1 if @str contains fewer than three '/' characters.
+ *
+ * The digits are parsed in place.  The previous implementation copied the
+ * tail of @str into a 10-byte stack buffer without bounding the copy, and
+ * for a string with exactly two '/' it read past the terminator and wrote
+ * before the start of that buffer.
+ */
+static long get_id(const char *str)
+{
+        int len;
+        const char *ptr;
+
+        len = strsep_len(str, '/', 2);
+        /* strsep_len() returns the string length when the third '/' is
+         * missing but two were found; there is then no id to parse. */
+        if (len < 0 || str[len] == '\0')
+                return -1;
+
+        ptr = str + len + 1;
+        DPRINTK("Get_id called for %s (%s)\n", str, ptr);
+
+        return simple_strtol(ptr, NULL, 10);
+}
+
+/*
+ * Switch the backend to Connected and start its worker thread, provided
+ * both an irq is bound and the sector count is known.  Returns without
+ * doing anything when not yet ready or when already connected.
+ */
+static void tap_update_blkif_status(blkif_t *blkif)
+{
+       struct xenbus_device *xdev;
+       struct task_struct *worker;
+       int err;
+
+       /* Both the bound irq and the disk size are required. */
+       if (blkif->irq == 0 || blkif->sectors == 0)
+               return;
+
+       xdev = blkif->be->dev;
+
+       /* Nothing to do if we are connected already. */
+       if (xdev->state == XenbusStateConnected)
+               return;
+
+       /* connect() returns nothing; success shows in the device state. */
+       connect(blkif->be);
+       if (xdev->state != XenbusStateConnected)
+               return;
+
+       worker = kthread_run(tap_blkif_schedule, blkif, "xvd %d",
+                            blkif->domid);
+       if (!IS_ERR(worker)) {
+               blkif->xenblkd = worker;
+               return;
+       }
+
+       err = PTR_ERR(worker);
+       blkif->xenblkd = NULL;
+       xenbus_dev_fatal(xdev, err, "start xenblkd");
+       WPRINTK("Error starting thread\n");
+}
+
+/*
+ * Release everything attached to @dev: the xenstore watch, the worker
+ * thread, the blkif and finally the backend_info itself.  Also used by
+ * blktap_probe() to unwind a partially set up device.  Always returns 0.
+ */
+static int blktap_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev->dev.driver_data;
+       blkif_t *blkif;
+
+       /* The watch is only registered if its node string is set. */
+       if (be->backend_watch.node != NULL) {
+               unregister_xenbus_watch(&be->backend_watch);
+               kfree(be->backend_watch.node);
+               be->backend_watch.node = NULL;
+       }
+
+       blkif = be->blkif;
+       if (blkif != NULL) {
+               /* Stop the worker before the interface is freed. */
+               if (blkif->xenblkd != NULL)
+                       kthread_stop(blkif->xenblkd);
+               signal_tapdisk(blkif->dev_num);
+               tap_blkif_free(blkif);
+               be->blkif = NULL;
+       }
+
+       dev->dev.driver_data = NULL;
+       kfree(be);
+       return 0;
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate
+ * the basic structures, and watch the store waiting for the
+ * user-space program to tell us the physical device info.  Switch to
+ * InitWait.
+ *
+ * Returns 0 on success or a negative error code.  On any failure after
+ * the backend_info has been allocated, blktap_remove() is used to undo
+ * whatever has been set up so far.
+ */
+static int blktap_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       int err;
+       struct backend_info *be = kzalloc(sizeof(struct backend_info),
+                                         GFP_KERNEL);
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+
+       be->dev = dev;
+       dev->dev.driver_data = be;
+       /* get_id() returns a long (-1 on a malformed name); stored as int. */
+       be->xenbus_id = get_id(dev->nodename);
+
+       be->blkif = tap_alloc_blkif(dev->otherend_id);
+       if (IS_ERR(be->blkif)) {
+               err = PTR_ERR(be->blkif);
+               /* Clear the ERR_PTR so blktap_remove() skips the blkif. */
+               be->blkif = NULL;
+               xenbus_dev_fatal(dev, err, "creating block interface");
+               goto fail;
+       }
+
+       /* setup back pointer */
+       be->blkif->be = be; 
+       /* zero sectors means "disk info not yet supplied by userspace" */
+       be->blkif->sectors = 0;
+
+       /* set a watch on disk info, waiting for userspace to update details*/
+       err = xenbus_watch_path2(dev, dev->nodename, "info",
+                                &be->backend_watch, tap_backend_changed);
+       if (err)
+               goto fail;
+       
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+       return 0;
+
+fail:
+       DPRINTK("blktap probe failed");
+       blktap_remove(dev);
+       return err;
+}
+
+
+/**
+ * Callback received when the user space code has placed the device
+ * information in xenstore. 
+ *
+ * Reads "info" and "sectors" from the backend node, associates the blkif
+ * with a tap device via dom_to_devid() and then tries to complete the
+ * connection.  If "info" cannot be read the callback reports the error
+ * and returns without touching the blkif.
+ */
+static void tap_backend_changed(struct xenbus_watch *watch,
+                           const char **vec, unsigned int len)
+{
+       int err;
+       unsigned long info;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, backend_watch);
+       struct xenbus_device *dev = be->dev;
+       
+       /** 
+        * Check to see whether userspace code has opened the image 
+        * and written sector
+        * and disk info to xenstore
+        */
+       err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, 
+                           NULL);      
+       if (err) {
+               xenbus_dev_error(dev, err, "getting info");
+               return;
+       }
+
+       DPRINTK("Userspace update on disk info, %lu\n",info);
+
+       err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", 
+                           &be->blkif->sectors, NULL);
+       /*
+        * Report a failed read instead of dropping it silently.  Execution
+        * deliberately continues as before: tap_update_blkif_status()
+        * refuses to connect while sectors is still zero.
+        */
+       if (err)
+               xenbus_dev_error(dev, err, "getting sectors");
+
+       /* Associate tap dev with domid*/
+       be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, 
+                                         be->blkif);
+       DPRINTK("Thread started for domid [%d], connecting disk\n", 
+               be->blkif->dev_num);
+
+       tap_update_blkif_status(be->blkif);
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ *
+ * Initialised/Connected: map the ring and try to complete the connection.
+ * Closing: follow the frontend into Closing.
+ * Closed: unregister the device.
+ * Any other state except Initialising is reported as a fatal error.
+ */
+static void tap_frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev->dev.driver_data;
+       int err;
+
+       DPRINTK("");
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               break;
+
+       case XenbusStateInitialised:
+       case XenbusStateConnected:
+               /* Ensure we connect even when two watches fire in
+                  close succession and we miss the intermediate value
+                  of frontend_state. */
+               if (dev->state == XenbusStateConnected)
+                       break;
+
+               /* connect_ring() has already reported any failure. */
+               err = connect_ring(be);
+               if (err)
+                       break;
+               tap_update_blkif_status(be->blkif);
+               break;
+
+       case XenbusStateClosing:
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               device_unregister(&dev->dev);
+               break;
+
+       case XenbusStateUnknown:
+       case XenbusStateInitWait:
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+
+/**
+ * Switch to Connected state.
+ *
+ * Nothing is returned; a failed switch is reported through
+ * xenbus_dev_fatal() and callers check dev->state to see whether the
+ * switch took effect.
+ */
+static void connect(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       int err;
+
+       err = xenbus_switch_state(dev, XenbusStateConnected);
+       /* The message has no conversions, so no extra arguments are passed
+        * (a stray dev->nodename used to be supplied here). */
+       if (err)
+               xenbus_dev_fatal(dev, err, "switching to Connected state");
+}
+
+
+/*
+ * Read the frontend's "ring-ref" and "event-channel" entries from xenstore
+ * and hand them to tap_blkif_map().  Returns 0 on success; on failure the
+ * error is reported with xenbus_dev_fatal() and returned.
+ */
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned long ring_ref;
+       unsigned int evtchn;
+       int err;
+
+       DPRINTK("%s", dev->otherend);
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                           "ring-ref", "%lu", &ring_ref,
+                           "event-channel", "%u", &evtchn,
+                           NULL);
+       if (err != 0) {
+               xenbus_dev_fatal(dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                dev->otherend);
+               return err;
+       }
+
+       /* Map the shared frame, irq etc. */
+       err = tap_blkif_map(be->blkif, ring_ref, evtchn);
+       if (err != 0)
+               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+                                ring_ref, evtchn);
+
+       return err;
+}
+
+
+/* ** Driver Registration ** */
+
+
+/* Device types handled by this driver; the empty entry ends the list. */
+static struct xenbus_device_id blktap_ids[] = {
+       { "tap" },
+       { "" }
+};
+
+
+/* Driver descriptor handed to xenbus by tap_blkif_xenbus_init(). */
+static struct xenbus_driver blktap = {
+       .name = "tap",
+       .owner = THIS_MODULE,
+       .ids = blktap_ids,
+       .probe = blktap_probe,
+       .remove = blktap_remove,
+       .otherend_changed = tap_frontend_changed
+};
+
+
+/*
+ * Register the blktap backend driver with xenbus.
+ * NOTE(review): the result of xenbus_register_backend() is discarded --
+ * confirm that a registration failure may be ignored.
+ */
+void tap_blkif_xenbus_init(void)
+{
+       xenbus_register_backend(&blktap);
+}
diff --git a/patches/linux-2.6.16.13/blktap-aio-16_03_06.patch b/patches/linux-2.6.16.13/blktap-aio-16_03_06.patch
new file mode 100644 (file)
index 0000000..5f4fd6f
--- /dev/null
@@ -0,0 +1,297 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/fs/aio.c ./fs/aio.c
+--- ../pristine-linux-2.6.16-rc5/fs/aio.c      2006-03-14 14:10:10.827401387 +0000
++++ ./fs/aio.c 2006-03-16 09:57:53.898316582 +0000
+@@ -34,6 +34,11 @@
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
++#ifdef CONFIG_EPOLL
++#include <linux/poll.h>
++#include <linux/eventpoll.h>
++#endif
++
+ #if DEBUG > 1
+ #define dprintk               printk
+ #else
+@@ -1016,6 +1021,10 @@ put_rq:
+       if (waitqueue_active(&ctx->wait))
+               wake_up(&ctx->wait);
++#ifdef CONFIG_EPOLL
++      if (ctx->file && waitqueue_active(&ctx->poll_wait))
++              wake_up(&ctx->poll_wait);
++#endif
+       if (ret)
+               put_ioctx(ctx);
+@@ -1025,6 +1034,8 @@ put_rq:
+ /* aio_read_evt
+  *    Pull an event off of the ioctx's event ring.  Returns the number of 
+  *    events fetched (0 or 1 ;-)
++ *    If ent parameter is 0, just returns the number of events that would
++ *    be fetched.
+  *    FIXME: make this use cmpxchg.
+  *    TODO: make the ringbuffer user mmap()able (requires FIXME).
+  */
+@@ -1047,13 +1058,18 @@ static int aio_read_evt(struct kioctx *i
+       head = ring->head % info->nr;
+       if (head != ring->tail) {
+-              struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+-              *ent = *evp;
+-              head = (head + 1) % info->nr;
+-              smp_mb(); /* finish reading the event before updatng the head */
+-              ring->head = head;
+-              ret = 1;
+-              put_aio_ring_event(evp, KM_USER1);
++              if (ent) { /* event requested */
++                      struct io_event *evp =
++                              aio_ring_event(info, head, KM_USER1);
++                      *ent = *evp;
++                      head = (head + 1) % info->nr;
++                      /* finish reading the event before updating the head */
++                      smp_mb();
++                      ring->head = head;
++                      ret = 1;
++                      put_aio_ring_event(evp, KM_USER1);
++              } else /* only need to know availability */
++                      ret = 1;
+       }
+       spin_unlock(&info->ring_lock);
+@@ -1236,9 +1252,78 @@ static void io_destroy(struct kioctx *io
+       aio_cancel_all(ioctx);
+       wait_for_all_aios(ioctx);
++#ifdef CONFIG_EPOLL
++      /* forget the poll file, but it's up to the user to close it */
++      if (ioctx->file) {
++              ioctx->file->private_data = 0;
++              ioctx->file = 0;
++      }
++#endif
+       put_ioctx(ioctx);       /* once for the lookup */
+ }
++#ifdef CONFIG_EPOLL
++
++/*
++ * release() handler for the aio queue fd: breaks the link between the
++ * file and its io context in both directions.  Always returns 0.
++ */
++static int aio_queue_fd_close(struct inode *inode, struct file *file)
++{
++      struct kioctx *ioctx = file->private_data;
++      /* private_data is already cleared if io_destroy() ran first */
++      if (ioctx) {
++              file->private_data = 0;
++              spin_lock_irq(&ioctx->ctx_lock);
++              ioctx->file = 0;
++              spin_unlock_irq(&ioctx->ctx_lock);
++      }
++      return 0;
++}
++
++/*
++ * poll() handler for the aio queue fd.  Returns POLLIN | POLLRDNORM when
++ * aio_read_evt() reports an event available on the associated io context,
++ * and 0 otherwise (including when the file has no io context any more).
++ */
++static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
++{
++      unsigned int pollflags = 0;
++      struct kioctx *ioctx = file->private_data;
++
++      if (ioctx) {
++              /*
++               * Insert inside our poll wait queue.  This is done before
++               * ctx_lock is taken: the poll table callback may allocate
++               * memory and sleep, which is not allowed with a spinlock
++               * held and interrupts disabled.
++               */
++              poll_wait(file, &ioctx->poll_wait, wait);
++
++              /* Check our condition */
++              spin_lock_irq(&ioctx->ctx_lock);
++              if (aio_read_evt(ioctx, 0))
++                      pollflags = POLLIN | POLLRDNORM;
++              spin_unlock_irq(&ioctx->ctx_lock);
++      }
++
++      return pollflags;
++}
++
++/* File operations of the aio queue fd: it can only be polled and closed. */
++static struct file_operations aioq_fops = {
++      .release        = aio_queue_fd_close,
++      .poll           = aio_queue_fd_poll
++};
++
++/* make_aio_fd:
++ *  Create a file descriptor that can be used to poll the event queue.
++ *  Based and piggybacked on the excellent epoll code.
++ *  Returns the new file descriptor, or the error from ep_getfd().
++ */
++
++static int make_aio_fd(struct kioctx *ioctx)
++{
++      int error, fd;
++      struct inode *inode;
++      struct file *file;
++
++      error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
++      if (error)
++              return error;
++
++      /*
++       * Set up the wait queue before the file and the context are made
++       * to point at each other: the completion path checks ctx->file and
++       * then uses ctx->poll_wait, and the poll handler uses it as soon as
++       * file->private_data is set.
++       */
++      init_waitqueue_head(&ioctx->poll_wait);
++
++      /* associate the file with the IO context */
++      file->private_data = ioctx;
++      ioctx->file = file;
++      return fd;
++}
++#endif
++
++
+ /* sys_io_setup:
+  *    Create an aio_context capable of receiving at least nr_events.
+  *    ctxp must not point to an aio_context that already exists, and
+@@ -1251,18 +1336,30 @@ static void io_destroy(struct kioctx *io
+  *    resources are available.  May fail with -EFAULT if an invalid
+  *    pointer is passed for ctxp.  Will fail with -ENOSYS if not
+  *    implemented.
++ *
++ *    To request a selectable fd, the user context has to be initialized
++ *    to 1, instead of 0, and the return value is the fd.
++ *    This keeps the system call compatible, since a non-zero value
++ *    was not allowed so far.
+  */
+ asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
+ {
+       struct kioctx *ioctx = NULL;
+       unsigned long ctx;
+       long ret;
++      int make_fd = 0;
+       ret = get_user(ctx, ctxp);
+       if (unlikely(ret))
+               goto out;
+       ret = -EINVAL;
++#ifdef CONFIG_EPOLL
++      if (ctx == 1) {
++              make_fd = 1;
++              ctx = 0;
++      }
++#endif
+       if (unlikely(ctx || nr_events == 0)) {
+               pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+                        ctx, nr_events);
+@@ -1273,8 +1370,12 @@ asmlinkage long sys_io_setup(unsigned nr
+       ret = PTR_ERR(ioctx);
+       if (!IS_ERR(ioctx)) {
+               ret = put_user(ioctx->user_id, ctxp);
+-              if (!ret)
+-                      return 0;
++#ifdef CONFIG_EPOLL
++              if (make_fd && ret >= 0)
++                      ret = make_aio_fd(ioctx);
++#endif
++              if (ret >= 0)
++                      return ret;
+               get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
+               io_destroy(ioctx);
+
+diff -pruN ../pristine-linux-2.6.16-rc5/fs/eventpoll.c ./fs/eventpoll.c
+--- ../pristine-linux-2.6.16-rc5/fs/eventpoll.c        2006-01-03 03:21:10.000000000 +0000
++++ ./fs/eventpoll.c   2006-03-16 10:04:35.469956167 +0000
+@@ -235,8 +235,6 @@ struct ep_pqueue {
+ static void ep_poll_safewake_init(struct poll_safewake *psw);
+ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
+-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+-                  struct eventpoll *ep);
+ static int ep_alloc(struct eventpoll **pep);
+ static void ep_free(struct eventpoll *ep);
+ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+@@ -266,7 +264,7 @@ static int ep_events_transfer(struct eve
+ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+                  int maxevents, long timeout);
+ static int eventpollfs_delete_dentry(struct dentry *dentry);
+-static struct inode *ep_eventpoll_inode(void);
++static struct inode *ep_eventpoll_inode(struct file_operations *fops);
+ static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
+                                             int flags, const char *dev_name,
+                                             void *data);
+@@ -525,7 +523,7 @@ asmlinkage long sys_epoll_create(int siz
+        * Creates all the items needed to setup an eventpoll file. That is,
+        * a file structure, and inode and a free file descriptor.
+        */
+-      error = ep_getfd(&fd, &inode, &file, ep);
++      error = ep_getfd(&fd, &inode, &file, ep, &eventpoll_fops);
+       if (error)
+               goto eexit_2;
+@@ -710,8 +708,8 @@ eexit_1:
+ /*
+  * Creates the file descriptor to be used by the epoll interface.
+  */
+-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+-                  struct eventpoll *ep)
++int ep_getfd(int *efd, struct inode **einode, struct file **efile,
++                  struct eventpoll *ep, struct file_operations *fops)
+ {
+       struct qstr this;
+       char name[32];
+@@ -727,7 +725,7 @@ static int ep_getfd(int *efd, struct ino
+               goto eexit_1;
+       /* Allocates an inode from the eventpoll file system */
+-      inode = ep_eventpoll_inode();
++      inode = ep_eventpoll_inode(fops);
+       error = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto eexit_2;
+@@ -758,7 +756,7 @@ static int ep_getfd(int *efd, struct ino
+       file->f_pos = 0;
+       file->f_flags = O_RDONLY;
+-      file->f_op = &eventpoll_fops;
++      file->f_op = fops;
+       file->f_mode = FMODE_READ;
+       file->f_version = 0;
+       file->private_data = ep;
+@@ -1574,7 +1572,7 @@ static int eventpollfs_delete_dentry(str
+ }
+-static struct inode *ep_eventpoll_inode(void)
++static struct inode *ep_eventpoll_inode(struct file_operations *fops)
+ {
+       int error = -ENOMEM;
+       struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
+@@ -1582,7 +1580,7 @@ static struct inode *ep_eventpoll_inode(
+       if (!inode)
+               goto eexit_1;
+-      inode->i_fop = &eventpoll_fops;
++      inode->i_fop = fops;
+       /*
+        * Mark the inode dirty from the very beginning,
+
+diff -pruN ../pristine-linux-2.6.16-rc5/include/linux/aio.h ./include/linux/aio.h
+--- ../pristine-linux-2.6.16-rc5/include/linux/aio.h   2006-03-14 14:10:21.597916731 +0000
++++ ./include/linux/aio.h      2006-03-16 10:05:39.848833028 +0000
+@@ -191,6 +191,11 @@ struct kioctx {
+       struct aio_ring_info    ring_info;
+       struct work_struct      wq;
++#ifdef CONFIG_EPOLL
++      // poll integration
++      wait_queue_head_t       poll_wait;
++      struct file             *file;
++#endif
+ };
+ /* prototypes */
+
+diff -pruN ../pristine-linux-2.6.16-rc5/include/linux/eventpoll.h ./include/linux/eventpoll.h
+--- ../pristine-linux-2.6.16-rc5/include/linux/eventpoll.h     2006-01-03 03:21:10.000000000 +0000
++++ ./include/linux/eventpoll.h        2006-03-16 10:08:51.577809317 +0000
+@@ -86,6 +86,12 @@ static inline void eventpoll_release(str
+ }
++/*
++ * called by aio code to create an fd that can poll the aio event queue
++ */
++struct eventpoll;
++int ep_getfd(int *efd, struct inode **einode, struct file **efile,
++             struct eventpoll *ep, struct file_operations *fops);
+ #else
+ static inline void eventpoll_init_file(struct file *file) {}
index ac41f2f32177f8ee859e33a8726e5cc0156ec2e6..2a42254e322b0d358cf9ac29981aefad84243a8b 100644 (file)
@@ -16,6 +16,8 @@ SUBDIRS-y += guest-headers
 SUBDIRS-$(VTPM_TOOLS) += vtpm_manager
 SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-y += xenstat
+SUBDIRS-y += libaio
+SUBDIRS-y += blktap
 
 # These don't cross-compile
 ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile
new file mode 100644 (file)
index 0000000..fb194f3
--- /dev/null
@@ -0,0 +1,28 @@
+# Top-level Makefile for the blktap userspace tools: it only recurses into
+# the subdirectories listed in SUBDIRS-y.
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Subdirectories are visited in this order by every loop below.
+# NOTE(review): drivers links with -L../lib -lblktap, so lib presumably has
+# to be built first -- confirm before reordering or parallelising.
+SUBDIRS-y :=
+SUBDIRS-y += lib
+SUBDIRS-y += drivers
+
+.PHONY: all
+all: build
+
+# mk-symlinks is not defined in this file.
+# NOTE(review): presumably provided by tools/Rules.mk -- confirm.
+# "set -e" makes the loop stop at the first failing sub-make.
+.PHONY: build
+build: mk-symlinks
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir all;       \
+               done
+
+.PHONY: install
+install:
+       @set -e; for subdir in $(SUBDIRS-y); do \
+               $(MAKE) -C $$subdir install; \
+       done
+
+# LIB and DEPS are not set in this file; if Rules.mk does not set them
+# either they expand to nothing.
+.PHONY: clean
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir clean;       \
+               done
diff --git a/tools/blktap/README b/tools/blktap/README
new file mode 100644 (file)
index 0000000..5e41080
--- /dev/null
@@ -0,0 +1,122 @@
+Blktap Userspace Tools + Library
+================================
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+{firstname.lastname}@cl.cam.ac.uk
+
+The blktap userspace toolkit provides a user-level disk I/O
+interface. The blktap mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries.  Using these tools, blktap allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well.  Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+  formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+  to flushing dirty pages which are present in the Linux loopback
+  driver.  (Specifically, doing a large number of writes to an
+  NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+  resources, and process-granularity QoS techniques (disk scheduling
+  and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+  networking libraries, compression utilities, peer-to-peer
+  file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+  fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired.  The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code.  We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2006 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - File-backed Qcow disks
+ - Standalone sparse Qcow disks
+ - Fast shareable RAM disk between VMs (requires some form of cluster-based 
+   filesystem support e.g. OCFS2 in the guest kernel)
+ - Some VMDK images - your mileage may vary
+
+Raw and QCow images have asynchronous backends and so should perform
+fairly well.  VMDK is based directly on the qemu vmdk driver, which is
+synchronous (a.k.a. slow).
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap backend driver in your dom0 kernel.  It
+will cooperate fine with the existing backend driver, so you can
+experiment with tap disks without breaking existing VM configs.
+
+To build the tools separately, "make && make install" in 
+tools/blktap.
+
+
+Using the Tools
+===============
+
+Prepare the image for booting. For qcow files use the qcow utilities
+installed earlier. e.g. qcow-create generates a blank standalone image
+or a file-backed CoW image. img2qcow takes an existing image or
+partition and creates a sparse, standalone qcow-based file.
+
+The userspace disk agent is configured to start automatically via xend
+(alternatively you can start it manually => 'blktapctrl')
+
+Customise the VM config file to use the 'tap' handler, followed by the
+driver type. e.g. for a raw image such as a file or partition:
+
+disk = ['tap:aio:<FILENAME>,sda1,w']
+
+e.g. for a qcow image:
+
+disk = ['tap:qcow:<FILENAME>,sda1,w']
+
+
+Mounting images in Dom0 using the blktap driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach. You will need to build a xenlinux Dom0 kernel that
+includes the blkfront driver (e.g. the default 'make world' or 
+'make kernels' build). Simply use the xm command-line tool to activate
+the backend disks, and blkfront will generate a virtual block device that
+can be accessed in the same way as a loop device or partition:
+
+e.g. for a raw image file <FILENAME> that would normally be mounted using
+the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the
+following:
+
+xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk        <--- don't use loop driver
+
+In this way, you can use any of the userspace device-type drivers built
+with the blktap userspace toolkit to open and mount disks such as qcow
+or vmdk images:
+
+xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk
+
+
+
diff --git a/tools/blktap/drivers/Makefile b/tools/blktap/drivers/Makefile
new file mode 100644 (file)
index 0000000..6601a4d
--- /dev/null
@@ -0,0 +1,76 @@
+# Build rules for the blktap userspace programs in this directory:
+# blktapctrl, tapdisk and the qcow image utilities.
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Headers from the parent blktap directory and its lib/ subdirectory.
+INCLUDES += -I.. -I../lib
+
+# Installation settings.  IBIN and QCOW_UTIL together are the full set of
+# programs copied into $(DESTDIR)$(INSTALL_DIR) by "make install".
+INSTALL      = install
+INSTALL_PROG = $(INSTALL) -m0755
+IBIN         = blktapctrl tapdisk
+QCOW_UTIL    = img2qcow qcow2raw qcow-create
+INSTALL_DIR  = /usr/sbin
+LIBAIO_DIR   = ../../libaio/src
+
+# Compiler flags.  XEN_LIBXC (and XEN_XENSTORE below) are not defined in
+# this file; they are expected to come from Rules.mk.
+CFLAGS   += -fPIC
+CFLAGS   += -Wall
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -g3
+CFLAGS   += -fno-strict-aliasing
+CFLAGS   += -I $(XEN_LIBXC) -I $(LIBAIO_DIR)
+CFLAGS   += $(INCLUDES) -I. -I../../xenstore
+CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+CFLAGS   += -D_GNU_SOURCE
+
+# Get gcc to generate the dependencies for us.  Each compile writes a
+# hidden .<target>.d file, which is pulled back in by the -include at the
+# bottom of this file.
+CFLAGS   += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+# THREADLIB is not referenced by any rule in this file.
+THREADLIB := -lpthread -lz
+
+# Libraries every program links against.
+LIBS      := -L. -L.. -L../lib
+LIBS      += -L$(XEN_LIBXC)
+LIBS      += -lblktap
+LIBS      += -lcrypto
+LIBS      += -lz
+LIBS      += -L$(XEN_XENSTORE) -lxenstore
+
+# libaio from the in-tree copy; -static asks gcc for a static link.
+AIOLIBS   := -L $(LIBAIO_DIR)
+AIOLIBS   += -laio
+AIOLIBS   += -static
+
+# Block-driver objects shared by tapdisk and the qcow utilities.
+BLK-OBJS  := block-aio.o
+BLK-OBJS  += block-sync.o
+BLK-OBJS  += block-vmdk.o
+BLK-OBJS  += block-ram.o
+BLK-OBJS  += block-qcow.o
+BLK-OBJS  += aes.o
+
+# Default goal: build every program this directory installs.
+all: $(IBIN) qcow-util
+
+# Not referenced by any rule in this file.
+LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
+
+# blktapctrl is built from a single source file.  The source is listed as
+# a prerequisite so edits trigger a rebuild, and it is placed before
+# $(LIBS) because the linker resolves symbols in command-line order.
+blktapctrl: blktapctrl.c
+       $(CC) $(CFLAGS) -o $@ $< $(LIBS)
+
+# tapdisk and each qcow utility are <name>.c linked with the shared
+# block-driver objects.  Giving every binary its own file target means
+# only out-of-date programs are relinked and each one gets its own
+# .<name>.d dependency file.
+tapdisk $(QCOW_UTIL): %: %.c $(BLK-OBJS)
+       $(CC) $(CFLAGS) -o $@ $(BLK-OBJS) $< \
+               $(AIOLIBS) $(LIBS)
+
+# Convenience alias: build all of the qcow utilities.
+qcow-util: $(QCOW_UTIL)
+
+# Create the destination directory first so installing into a fresh
+# DESTDIR staging tree works.
+install: all
+       $(INSTALL) -d -m0755 $(DESTDIR)$(INSTALL_DIR)
+       $(INSTALL_PROG) $(IBIN) $(QCOW_UTIL) $(DESTDIR)$(INSTALL_DIR)
+
+# Only names defined in this file are passed to rm -rf; an undefined
+# variable could be inherited from the environment and expand to an
+# unrelated path.
+clean:
+       rm -rf *.o *~ $(DEPS) xen TAGS $(IBIN) $(QCOW_UTIL)
+
+.PHONY: all qcow-util install clean
+
+-include $(DEPS)
diff --git a/tools/blktap/drivers/aes.c b/tools/blktap/drivers/aes.c
new file mode 100644 (file)
index 0000000..4d83fac
--- /dev/null
@@ -0,0 +1,1319 @@
+/**
+ * 
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.
+ */
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+//#include "vl.h"
+#include <inttypes.h>
+#include <string.h>
+#include "aes.h"
+
+//#define NDEBUG
+#include <assert.h>
+
+/* Short aliases for the fixed-width unsigned types from <inttypes.h>. */
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+/*
+ * Size limits derived from a 256-bit key: 256/32 = 8 and 256/8 = 32.
+ * NOTE(review): presumably MAXKC is the key length in 32-bit words, MAXKB
+ * the key length in bytes and MAXNR the maximum round count -- confirm
+ * against the users of these macros, which are outside this excerpt.
+ */
+#define MAXKC   (256/32)
+#define MAXKB   (256/8)
+#define MAXNR   14
+
+/*
+ * This controls loop-unrolling in aes_core.c (the name used by the
+ * original comment; this file is aes.c).  FULL_UNROLL is explicitly
+ * undefined here.
+ */
+#undef FULL_UNROLL
+/*
+ * GETU32(pt): build a u32 from the four bytes pt[0..3], with pt[0] as the
+ * most significant byte (big-endian order).
+ */
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))
+/*
+ * PUTU32(ct, st): store the u32 st into ct[0..3], most significant byte
+ * first.  Expands to a bare { ... } block rather than do { } while (0), so
+ * "if (c) PUTU32(a, b); else ..." without braces will not compile.
+ */
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); }
+
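+/*
+Precomputed lookup tables.  S is the AES S-box and Si is its inverse.
+Each 32-bit entry packs four bytes, most significant first; the bracketed
+constants are the factors S[x] (or Si[x]) is multiplied by in GF(2^8) to
+produce each byte.  For example S[0] = 0x63, so Te0[0] = 0xc66363a5.
+
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/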
+
+static const u32 Te0[256] = {
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const u32 Te4[256] = {
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const u32 Td0[256] = {
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = { /* decrypt round table, indexed by bits 8..15 of a state word; entries look like Td1's rotated right by 8 bits */
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const u32 Td3[256] = { /* decrypt round table, indexed by the low byte of a state word; entries look like Td2's rotated right by 8 bits */
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const u32 Td4[256] = { /* one byte value replicated in all four lanes per entry (presumably the AES inverse S-box); the decrypt final round masks out the lane it needs */
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+static const u32 rcon[] = { /* round constants: the key expansion XORs rcon[i] into the first new word of step i */
+       0x01000000, 0x02000000, 0x04000000, 0x08000000,
+       0x10000000, 0x20000000, 0x40000000, 0x80000000,
+       0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/*
+ * Key-schedule helper: builds a word from Te4 lookups of the four bytes
+ * of w, with the byte lanes rotated left by one position.
+ */
+static u32 aes_ks_rot_sub(u32 w)
+{
+    return (Te4[(w >> 16) & 0xff] & 0xff000000) ^
+           (Te4[(w >>  8) & 0xff] & 0x00ff0000) ^
+           (Te4[(w      ) & 0xff] & 0x0000ff00) ^
+           (Te4[(w >> 24)       ] & 0x000000ff);
+}
+
+/*
+ * Key-schedule helper: builds a word from Te4 lookups of the four bytes
+ * of w, every byte staying in its own lane.
+ */
+static u32 aes_ks_sub(u32 w)
+{
+    return (Te4[(w >> 24)       ] & 0xff000000) ^
+           (Te4[(w >> 16) & 0xff] & 0x00ff0000) ^
+           (Te4[(w >>  8) & 0xff] & 0x0000ff00) ^
+           (Te4[(w      ) & 0xff] & 0x000000ff);
+}
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * userKey: key material; bits / 32 words are read from it with GETU32.
+ * bits:    key length in bits, one of 128, 192 or 256.
+ * key:     receives the round keys in key->rd_key and the round count
+ *          (10, 12 or 14) in key->rounds.
+ *
+ * Returns 0 on success, -1 when userKey or key is a null pointer and -2
+ * when bits is not a supported length; key is not written on failure.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+    u32 *w;
+    int nk, n, step;
+
+    if (!(userKey && key))
+        return -1;
+
+    /* nk = number of 32-bit words in the user key */
+    switch (bits) {
+    case 128:
+        nk = 4;
+        key->rounds = 10;
+        break;
+    case 192:
+        nk = 6;
+        key->rounds = 12;
+        break;
+    case 256:
+        nk = 8;
+        key->rounds = 14;
+        break;
+    default:
+        return -2;
+    }
+
+    /* the schedule starts with the user key itself */
+    w = key->rd_key;
+    for (n = 0; n < nk; n++)
+        w[n] = GETU32(userKey + 4 * n);
+
+    if (nk == 4) {
+        /* ten steps, each deriving four new words from the previous four */
+        for (step = 0; ; w += 4) {
+            w[4] = w[0] ^ aes_ks_rot_sub(w[3]) ^ rcon[step];
+            w[5] = w[1] ^ w[4];
+            w[6] = w[2] ^ w[5];
+            w[7] = w[3] ^ w[6];
+            if (++step == 10)
+                break;
+        }
+    } else if (nk == 6) {
+        /* eight steps of six words; the last step stops after four */
+        for (step = 0; ; w += 6) {
+            w[6] = w[0] ^ aes_ks_rot_sub(w[5]) ^ rcon[step];
+            w[7] = w[1] ^ w[6];
+            w[8] = w[2] ^ w[7];
+            w[9] = w[3] ^ w[8];
+            if (++step == 8)
+                break;
+            w[10] = w[4] ^ w[9];
+            w[11] = w[5] ^ w[10];
+        }
+    } else {
+        /* seven steps of eight words; the last step stops after four */
+        for (step = 0; ; w += 8) {
+            w[ 8] = w[0] ^ aes_ks_rot_sub(w[7]) ^ rcon[step];
+            w[ 9] = w[1] ^ w[ 8];
+            w[10] = w[2] ^ w[ 9];
+            w[11] = w[3] ^ w[10];
+            if (++step == 7)
+                break;
+            /* second half of the step: no rotation, no round constant */
+            w[12] = w[4] ^ aes_ks_sub(w[11]);
+            w[13] = w[5] ^ w[12];
+            w[14] = w[6] ^ w[13];
+            w[15] = w[7] ^ w[14];
+        }
+    }
+    return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * The encryption schedule is generated first. Its round keys are then put
+ * in reverse order, and every round key except the first and the last is
+ * run through the inverse MixColumn transform (Td0..Td3 over Te4 lookups).
+ *
+ * Returns 0 on success, or the negative status reported by
+ * AES_set_encrypt_key.
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+    u32 *w;
+    u32 v;
+    int head, tail, lane, round, rc;
+
+    /* begin from the encryption schedule */
+    rc = AES_set_encrypt_key(userKey, bits, key);
+    if (rc < 0)
+        return rc;
+
+    w = key->rd_key;
+
+    /* reverse the round keys: swap 4-word groups from both ends inwards */
+    head = 0;
+    tail = 4 * key->rounds;
+    while (head < tail) {
+        for (lane = 0; lane < 4; lane++) {
+            v = w[head + lane];
+            w[head + lane] = w[tail + lane];
+            w[tail + lane] = v;
+        }
+        head += 4;
+        tail -= 4;
+    }
+
+    /* inverse MixColumn on every round key but the first and the last */
+    for (round = 1; round < key->rounds; round++) {
+        w += 4;
+        for (lane = 0; lane < 4; lane++) {
+            v = w[lane];
+            w[lane] =
+                Td0[Te4[(v >> 24)       ] & 0xff] ^
+                Td1[Te4[(v >> 16) & 0xff] & 0xff] ^
+                Td2[Te4[(v >>  8) & 0xff] & 0xff] ^
+                Td3[Te4[(v      ) & 0xff] & 0xff];
+        }
+    }
+    return 0;
+}
+
+#ifndef AES_ASM
+/*
+ * Encrypt a single 16-byte block.
+ *
+ * in:  block to encrypt; four 32-bit words are read from in, in + 4,
+ *      in + 8 and in + 12 with GETU32.
+ * out: receives the 16-byte result, written with PUTU32.
+ * key: key schedule; key->rd_key supplies the round keys and key->rounds
+ *      the round count (presumably prepared by AES_set_encrypt_key --
+ *      confirm against callers).
+ *
+ * in and out can overlap: the whole input is loaded into s0..s3 before
+ * the first PUTU32 to out.
+ *
+ * The three pointers are only checked with assert().
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+                const AES_KEY *key) {
+
+       const u32 *rk; /* cursor into the round-key words */
+       u32 s0, s1, s2, s3, t0, t1, t2, t3; /* two alternating copies of the 4-word state */
+#ifndef FULL_UNROLL
+       int r; /* round pairs still to run in the looped variant */
+#endif /* ?FULL_UNROLL */
+
+       assert(in && out && key);
+       rk = key->rd_key;
+
+       /*
+        * map byte array block to cipher state
+        * and add initial round key:
+        */
+       s0 = GETU32(in     ) ^ rk[0];
+       s1 = GETU32(in +  4) ^ rk[1];
+       s2 = GETU32(in +  8) ^ rk[2];
+       s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+       /* round 1: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+       /* round 2: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+       /* round 3: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+       /* round 4: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+       /* round 5: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+       /* round 6: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+       /* round 7: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+       /* round 8: */
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+       /* round 9: */
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+    if (key->rounds > 10) { /* rounds 10 and 11 run only for schedules with more than 10 rounds */
+        /* round 10: */
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+        if (key->rounds > 12) { /* rounds 12 and 13 run only for schedules with more than 12 rounds */
+            /* round 12: */
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+        }
+    }
+    rk += key->rounds << 2; /* four words per round: step to the key used by the last round below */
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     *
+     * Each pass of the loop runs one round from s0..s3 into t0..t3 using
+     * rk[4..7], advances rk by eight words and, unless that was the last
+     * pair, runs a second round from t0..t3 back into s0..s3 using the
+     * first four words at the new rk.
+     */
+    r = key->rounds >> 1; /* number of round pairs */
+    for (;;) {
+        t0 =
+            Te0[(s0 >> 24)       ] ^
+            Te1[(s1 >> 16) & 0xff] ^
+            Te2[(s2 >>  8) & 0xff] ^
+            Te3[(s3      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Te0[(s1 >> 24)       ] ^
+            Te1[(s2 >> 16) & 0xff] ^
+            Te2[(s3 >>  8) & 0xff] ^
+            Te3[(s0      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Te0[(s2 >> 24)       ] ^
+            Te1[(s3 >> 16) & 0xff] ^
+            Te2[(s0 >>  8) & 0xff] ^
+            Te3[(s1      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Te0[(s3 >> 24)       ] ^
+            Te1[(s0 >> 16) & 0xff] ^
+            Te2[(s1 >>  8) & 0xff] ^
+            Te3[(s2      ) & 0xff] ^
+            rk[7];
+
+        rk += 8;
+        if (--r == 0) {
+            break; /* state is left in t0..t3 for the last round below */
+        }
+
+        s0 =
+            Te0[(t0 >> 24)       ] ^
+            Te1[(t1 >> 16) & 0xff] ^
+            Te2[(t2 >>  8) & 0xff] ^
+            Te3[(t3      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Te0[(t1 >> 24)       ] ^
+            Te1[(t2 >> 16) & 0xff] ^
+            Te2[(t3 >>  8) & 0xff] ^
+            Te3[(t0      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Te0[(t2 >> 24)       ] ^
+            Te1[(t3 >> 16) & 0xff] ^
+            Te2[(t0 >>  8) & 0xff] ^
+            Te3[(t1      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Te0[(t3 >> 24)       ] ^
+            Te1[(t0 >> 16) & 0xff] ^
+            Te2[(t1 >>  8) & 0xff] ^
+            Te3[(t2      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+     * apply last round and
+     * map cipher state to byte array block:
+     * (each output word takes one byte lane from four Te4 lookups on
+     * t0..t3 and is XORed with rk[0..3] before being stored)
+     */
+       s0 =
+               (Te4[(t0 >> 24)       ] & 0xff000000) ^
+               (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t3      ) & 0xff] & 0x000000ff) ^
+               rk[0];
+       PUTU32(out     , s0);
+       s1 =
+               (Te4[(t1 >> 24)       ] & 0xff000000) ^
+               (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t0      ) & 0xff] & 0x000000ff) ^
+               rk[1];
+       PUTU32(out +  4, s1);
+       s2 =
+               (Te4[(t2 >> 24)       ] & 0xff000000) ^
+               (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t1      ) & 0xff] & 0x000000ff) ^
+               rk[2];
+       PUTU32(out +  8, s2);
+       s3 =
+               (Te4[(t3 >> 24)       ] & 0xff000000) ^
+               (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+               (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+               (Te4[(t2      ) & 0xff] & 0x000000ff) ^
+               rk[3];
+       PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+                const AES_KEY *key) {
+
+       const u32 *rk;
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+       int r;
+#endif /* ?FULL_UNROLL */
+
+       assert(in && out && key);
+       rk = key->rd_key;
+
+       /*
+        * map byte array block to cipher state
+        * and add initial round key:
+        */
+    s0 = GETU32(in     ) ^ rk[0];
+    s1 = GETU32(in +  4) ^ rk[1];
+    s2 = GETU32(in +  8) ^ rk[2];
+    s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+    if (key->rounds > 10) {
+        /* round 10: */
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+        if (key->rounds > 12) {
+            /* round 12: */
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+        }
+    }
+       rk += key->rounds << 2;
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = key->rounds >> 1;
+    for (;;) {
+        t0 =
+            Td0[(s0 >> 24)       ] ^
+            Td1[(s3 >> 16) & 0xff] ^
+            Td2[(s2 >>  8) & 0xff] ^
+            Td3[(s1      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Td0[(s1 >> 24)       ] ^
+            Td1[(s0 >> 16) & 0xff] ^
+            Td2[(s3 >>  8) & 0xff] ^
+            Td3[(s2      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Td0[(s2 >> 24)       ] ^
+            Td1[(s1 >> 16) & 0xff] ^
+            Td2[(s0 >>  8) & 0xff] ^
+            Td3[(s3      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Td0[(s3 >> 24)       ] ^
+            Td1[(s2 >> 16) & 0xff] ^
+            Td2[(s1 >>  8) & 0xff] ^
+            Td3[(s0      ) & 0xff] ^
+            rk[7];
+
+        rk += 8;
+        if (--r == 0) {
+            break;
+        }
+
+        s0 =
+            Td0[(t0 >> 24)       ] ^
+            Td1[(t3 >> 16) & 0xff] ^
+            Td2[(t2 >>  8) & 0xff] ^
+            Td3[(t1      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Td0[(t1 >> 24)       ] ^
+            Td1[(t0 >> 16) & 0xff] ^
+            Td2[(t3 >>  8) & 0xff] ^
+            Td3[(t2      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Td0[(t2 >> 24)       ] ^
+            Td1[(t1 >> 16) & 0xff] ^
+            Td2[(t0 >>  8) & 0xff] ^
+            Td3[(t3      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Td0[(t3 >> 24)       ] ^
+            Td1[(t2 >> 16) & 0xff] ^
+            Td2[(t1 >>  8) & 0xff] ^
+            Td3[(t0      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+        * apply last round and
+        * map cipher state to byte array block:
+        */
+       s0 =
+               (Td4[(t0 >> 24)       ] & 0xff000000) ^
+               (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t1      ) & 0xff] & 0x000000ff) ^
+               rk[0];
+       PUTU32(out     , s0);
+       s1 =
+               (Td4[(t1 >> 24)       ] & 0xff000000) ^
+               (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t2      ) & 0xff] & 0x000000ff) ^
+               rk[1];
+       PUTU32(out +  4, s1);
+       s2 =
+               (Td4[(t2 >> 24)       ] & 0xff000000) ^
+               (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t3      ) & 0xff] & 0x000000ff) ^
+               rk[2];
+       PUTU32(out +  8, s2);
+       s3 =
+               (Td4[(t3 >> 24)       ] & 0xff000000) ^
+               (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+               (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+               (Td4[(t0      ) & 0xff] & 0x000000ff) ^
+               rk[3];
+       PUTU32(out + 12, s3);
+}
+
+#endif /* AES_ASM */
+
+/*
+ * AES_cbc_encrypt - encrypt or decrypt a buffer with AES in CBC mode.
+ *
+ *   in     - input bytes
+ *   out    - output buffer (may be the same buffer as 'in')
+ *   length - number of input bytes to process
+ *   key    - key schedule handed to AES_encrypt()/AES_decrypt()
+ *   ivec   - AES_BLOCK_SIZE bytes of chaining value; updated in place so
+ *            that it always holds the last ciphertext block processed
+ *   enc    - non-zero to encrypt, zero to decrypt
+ *
+ * Whole blocks are processed one at a time.  A trailing partial block is
+ * handled as follows:
+ *   - encrypt: the missing plaintext bytes are treated as zero and a full
+ *     AES_BLOCK_SIZE-byte ciphertext block is written to 'out';
+ *   - decrypt: a full AES_BLOCK_SIZE-byte ciphertext block is read from
+ *     'in', but only the remaining 'len' plaintext bytes are written.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc)
+{
+
+       unsigned long n;
+       unsigned long len = length;
+       unsigned char tmp[AES_BLOCK_SIZE];
+
+       assert(in && out && key && ivec);
+
+       if (enc) {
+               while (len >= AES_BLOCK_SIZE) {
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       AES_encrypt(tmp, out, key);
+                       /* ciphertext becomes the next chaining value */
+                       memcpy(ivec, out, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       for(n=0; n < len; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       /* zero padding XOR ivec is just ivec */
+                       for(n=len; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = ivec[n];
+                       AES_encrypt(tmp, tmp, key);
+                       memcpy(out, tmp, AES_BLOCK_SIZE);
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+               }
+       } else {
+               while (len >= AES_BLOCK_SIZE) {
+                       /* save the ciphertext first: 'out' may alias 'in' */
+                       memcpy(tmp, in, AES_BLOCK_SIZE);
+                       AES_decrypt(in, out, key);
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               out[n] ^= ivec[n];
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       unsigned char cipher[AES_BLOCK_SIZE];
+
+                       /*
+                        * Keep a copy of the ciphertext block: it, not the
+                        * raw decryption result, is the next chaining value
+                        * (as in every other path of this function).
+                        */
+                       memcpy(cipher, in, AES_BLOCK_SIZE);
+                       AES_decrypt(cipher, tmp, key);
+                       for(n=0; n < len; ++n)
+                               out[n] = tmp[n] ^ ivec[n];
+                       memcpy(ivec, cipher, AES_BLOCK_SIZE);
+               }
+       }
+}
diff --git a/tools/blktap/drivers/aes.h b/tools/blktap/drivers/aes.h
new file mode 100644 (file)
index 0000000..a0167eb
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+/* uint32_t is used below; include it here so the header is self-contained */
+#include <stdint.h>
+
+/* Size of the key schedule: room for AES_MAXNR + 1 round keys of 4 words */
+#define AES_MAXNR 14
+/* AES block size in bytes */
+#define AES_BLOCK_SIZE 16
+
+/* Expanded key schedule plus the round count read by the cipher routines */
+struct aes_key_st {
+    uint32_t rd_key[4 *(AES_MAXNR + 1)];
+    int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/* Key setup: fill 'key' from a 'bits'-bit user key */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+
+/* Single-block primitives: process one AES_BLOCK_SIZE-byte block */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+/* CBC mode over 'length' bytes; 'enc' non-zero encrypts, zero decrypts */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc);
+
+#endif
diff --git a/tools/blktap/drivers/blktapctrl.c b/tools/blktap/drivers/blktapctrl.c
new file mode 100644 (file)
index 0000000..f4ade5b
--- /dev/null
@@ -0,0 +1,704 @@
+/*
+ * blktapctrl.c
+ * 
+ * userspace controller for the blktap disks.
+ * As requests for new block devices arrive,
+ * the controller spawns off a separate process
+ * per-disk.
+ *
+ *
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <err.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <linux/types.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <unistd.h>
+#include <xs.h>
+#include <printf.h>
+#include <sys/time.h>
+#include <syslog.h>
+                                                                     
+#include "blktaplib.h"
+#include "blktapctrl.h"
+#include "tapdisk.h"
+
+/* Number of entries in the pollfd array used by main() */
+#define NUM_POLL_FDS 2
+/* Size in bytes of the receive buffer allocated by read_msg() */
+#define MSG_SIZE 4096
+/* Default value of max_timeout (seconds) */
+#define MAX_TIMEOUT 10
+/* Modulus applied to lrand48() when generating a blkif cookie */
+#define MAX_RAND_VAL 0xFFFF
+
+/* Main-loop flag: main() polls while non-zero; sig_handler() clears it */
+int run = 1;
+/* Seconds write_msg()/read_msg() wait in select() for the control fd */
+int max_timeout = MAX_TIMEOUT;
+/* File descriptor of the blktap control device opened in main() */
+int ctlfd = 0;
+
+static int open_ctrl_socket(char *devname);
+static int write_msg(int fd, int msgtype, void *ptr, void *ptr2);
+static int read_msg(int fd, int msgtype, void *ptr);
+/* Per disk type: head of the list of blkifs currently using that type */
+static driver_list_entry_t *active_disks[MAX_DISK_TYPES];
+
+/* Signal handler: ask the main loop to stop by clearing the 'run' flag. */
+void sig_handler(int sig)
+{
+       (void)sig;      /* the signal number itself is not used */
+       run = 0;
+}
+
+/* Reset every per-type list head in active_disks[] to "no disks". */
+static void init_driver_list(void)
+{
+       int slot = MAX_DISK_TYPES;
+
+       while (slot-- > 0)
+               active_disks[slot] = NULL;
+}
+
+/*
+ * Seed the lrand48() generator from the microsecond field of the
+ * current time of day.
+ */
+static void init_rng(void)
+{
+       struct timeval now;
+       uint32_t seed;
+
+       gettimeofday(&now, NULL);
+       seed = now.tv_usec;
+       srand48(seed);
+}
+
+/*
+ * make_blktap_dev - make sure the character device node 'devname' exists.
+ *
+ *   devname - path of the device node
+ *   major   - major number used if the node has to be created
+ *   minor   - minor number used if the node has to be created
+ *
+ * If lstat() finds the path nothing is changed.  Otherwise BLKTAP_DEV_DIR
+ * is created (an already existing directory is fine) and the node is made
+ * with mode 0600.  Failures are logged; nothing is returned to the caller.
+ */
+static void make_blktap_dev(char *devname, int major, int minor)
+{
+       struct stat st;
+
+       if (lstat(devname, &st) == 0) {
+               DPRINTF("%s device already exists\n",devname);
+               return;
+       }
+
+       /*Need to create device*/
+       if (mkdir(BLKTAP_DEV_DIR, 0755) == 0) {
+               DPRINTF("Created %s directory\n",BLKTAP_DEV_DIR);
+       } else if (errno != EEXIST) {
+               DPRINTF("Unable to create %s directory (%d)\n",
+                       BLKTAP_DEV_DIR, errno);
+       }
+
+       if (mknod(devname, S_IFCHR|0600,
+               makedev(major, minor)) == 0) {
+               DPRINTF("Created %s device\n",devname);
+       } else {
+               DPRINTF("Unable to create %s device (%d)\n",
+                       devname, errno);
+       }
+}
+
+/*
+ * get_new_dev - ask the blktap control device for a new tap interface.
+ *
+ *   major - out: value returned by BLKTAP_IOCTL_MAJOR
+ *   minor - out: value returned by BLKTAP_IOCTL_NEWINTF
+ *   blkif - supplies the domid and be_id sent to the driver
+ *
+ * On success the matching device node is created if necessary and 0 is
+ * returned; -1 is returned on any failure.
+ *
+ * NOTE(review): on the failure paths after NEWINTF succeeded the minor is
+ * not handed back with BLKTAP_IOCTL_FREEINTF -- confirm whether it should.
+ */
+static int get_new_dev(int *major, int *minor, blkif_t *blkif)
+{
+       domid_translate_t tr;
+       int ret;
+       char *devname;
+
+       tr.domid = blkif->domid;
+       tr.busid = (unsigned short)blkif->be_id;
+       ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr );
+
+       if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
+               DPRINTF("Incorrect Dev ID [%d]\n",ret);
+               return -1;
+       }
+
+       *minor = ret;
+       *major = ioctl(ctlfd, BLKTAP_IOCTL_MAJOR, ret );
+       if (*major < 0) {
+               DPRINTF("Incorrect Major ID [%d]\n",*major);
+               return -1;
+       }
+
+       /* asprintf() leaves devname undefined on failure: do not use it */
+       if (asprintf(&devname,"%s/%s%d",BLKTAP_DEV_DIR, BLKTAP_DEV_NAME,
+                    *minor) == -1) {
+               DPRINTF("Unable to allocate device name\n");
+               return -1;
+       }
+       make_blktap_dev(devname,*major,*minor);
+       free(devname);  /* only needed while creating the node */
+
+       DPRINTF("Received device id %d and major %d, "
+               "sent domid %d and be_id %d\n",
+               *minor, *major, tr.domid, tr.busid);
+       return 0;
+}
+
+/*
+ * Query the tapdisk process behind 'blkif' for its PID: send CTLMSG_PID
+ * and wait for CTLMSG_PID_RSP (read_msg() stores the PID in the blkif).
+ * Returns 1 on success, -EINVAL if either step fails.
+ */
+static int get_tapdisk_pid(blkif_t *blkif)
+{
+       int rc;
+
+       rc = write_msg(blkif->fds[WRITE], CTLMSG_PID, blkif, NULL);
+       if (rc <= 0) {
+               DPRINTF("Write_msg failed - CTLMSG_PID(%d)\n", rc);
+               return -EINVAL;
+       }
+
+       rc = read_msg(blkif->fds[READ], CTLMSG_PID_RSP, blkif);
+       if (rc <= 0) {
+               DPRINTF("Read_msg failure - CTLMSG_PID(%d)\n", rc);
+               return -EINVAL;
+       }
+
+       return 1;
+}
+
+/*
+ * test_path - split a "<handle>:<device>" parameter string and look up the
+ * driver type named by the handle.
+ *
+ *   path - parameter string
+ *   dev  - out: points just past the first ':' in 'path', or NULL when
+ *          'path' contains no ':'
+ *   type - out: idnum of the matching dtypes[] entry, or
+ *          MAX_DISK_TYPES + 1 when no entry matches
+ *
+ * Returns the blkif of an already active disk when the matching driver has
+ * single_handler == 1 and such a disk exists; NULL in every other case.
+ */
+static blkif_t *test_path(char *path, char **dev, int *type)
+{
+       char *sep, handle[10];
+       int i, size;
+       size_t hlen;
+
+       size = sizeof(dtypes)/sizeof(disk_info_t *);
+       *type = MAX_DISK_TYPES + 1;
+
+       sep = strchr(path, ':');
+       if (sep == NULL) {
+               *dev = NULL;
+               return NULL;
+       }
+
+       *dev = sep + 1;
+       hlen = (size_t)(sep - path);
+       if (hlen >= sizeof(handle)) {
+               /* would overflow handle[]; treated as an unknown driver */
+               DPRINTF("Handle too long in [%s]\n", path);
+               return NULL;
+       }
+       memcpy(handle, path, hlen);
+       handle[hlen] = '\0';
+       DPRINTF("Detected handle: [%s]\n",handle);
+
+       for (i = 0; i < size; i++) {
+               /* handle is NUL-terminated, so compare whole strings */
+               if (strcmp(handle, dtypes[i]->handle) != 0)
+                       continue;
+
+               *type = dtypes[i]->idnum;
+
+               if (dtypes[i]->single_handler == 1) {
+                       /* Check whether tapdisk process already exists */
+                       if (active_disks[dtypes[i]->idnum] == NULL)
+                               return NULL;
+                       return active_disks[dtypes[i]->idnum]->blkif;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * add_disktype - append 'blkif' to the list of active disks of 'type'.
+ *
+ * Nothing is added when 'type' is not a valid index into active_disks[]
+ * (test_path() reports "no driver" as MAX_DISK_TYPES + 1) or when memory
+ * for the list entry cannot be allocated.
+ */
+static void add_disktype(blkif_t *blkif, int type)
+{
+       driver_list_entry_t *entry, *tail;
+
+       /* active_disks[] has MAX_DISK_TYPES slots: valid indices stop at -1 */
+       if (type < 0 || type >= MAX_DISK_TYPES) return;
+
+       entry = malloc(sizeof(driver_list_entry_t));
+       if (entry == NULL) {
+               DPRINTF("ADD_DISKTYPE: out of memory\n");
+               return;
+       }
+       entry->blkif = blkif;
+       entry->next = NULL;
+       entry->prev = NULL;
+
+       tail = active_disks[type];
+       if (tail == NULL) {
+               /* first disk of this type */
+               active_disks[type] = entry;
+               return;
+       }
+
+       while (tail->next != NULL)
+               tail = tail->next;
+
+       /*We've found the end of the list*/
+       tail->next = entry;
+       entry->prev = tail;
+}
+
+/*
+ * del_disktype - remove 'blkif' from the active list of its driver type.
+ *
+ * Returns non-zero when the caller should close the control descriptors
+ * of the blkif, zero when they must stay open:
+ *   - drivers with single_handler == 0: always 1 once the entry is removed;
+ *   - drivers with single_handler == 1: 1 only if the removed entry was
+ *     the last one on the list;
+ *   - 1 as well when the type is out of range or the blkif is not listed.
+ *
+ * NOTE(review): dtypes[] is indexed by the drivertype here, i.e. it is
+ * assumed that dtypes[i]->idnum == i -- confirm against tapdisk.h.
+ */
+static int del_disktype(blkif_t *blkif)
+{
+       driver_list_entry_t *cur, *prev;
+       int type = blkif->drivertype, do_close = 0;
+
+       /* active_disks[] has MAX_DISK_TYPES slots: valid indices stop at -1 */
+       if (type < 0 || type >= MAX_DISK_TYPES) return 1;
+
+       prev = NULL;
+       for (cur = active_disks[type]; cur != NULL;
+            prev = cur, cur = cur->next) {
+               if (cur->blkif != blkif)
+                       continue;
+
+               /* unlink: fix the forward pointer (or the list head) ... */
+               if (prev != NULL)
+                       prev->next = cur->next;
+               else
+                       active_disks[type] = cur->next;
+               /* ... and the back pointer of the successor, if any */
+               if (cur->next != NULL)
+                       cur->next->prev = prev;
+
+               /* a shared handler is only closed with its last user */
+               if (active_disks[type] == NULL &&
+                   dtypes[type]->single_handler == 1)
+                       do_close = 1;
+
+               DPRINTF("DEL_DISKTYPE: Freeing entry\n");
+               free(cur);
+               if (dtypes[type]->single_handler == 0) do_close = 1;
+
+               return do_close;
+       }
+       DPRINTF("DEL_DISKTYPE: No match\n");
+       return 1;
+}
+
+/*
+ * write_msg - build a control message and send it on 'fd'.
+ *
+ *   fd      - descriptor to write to
+ *   msgtype - CTLMSG_PARAMS, CTLMSG_NEWDEV, CTLMSG_CLOSE or CTLMSG_PID
+ *   ptr     - the blkif_t the message is about (drivertype, cookie, ...)
+ *   ptr2    - CTLMSG_PARAMS only: NUL-terminated path sent as payload
+ *
+ * Every message starts with a msg_hdr_t; CTLMSG_PARAMS appends the path
+ * (including its NUL) and CTLMSG_NEWDEV appends a msg_newdev_t.
+ *
+ * Returns the result of write() (bytes written, or -1 with the error
+ * logged), 0 if 'fd' did not become writable within max_timeout seconds,
+ * and -1 for an unknown message type, a missing path or lack of memory.
+ */
+static int write_msg(int fd, int msgtype, void *ptr, void *ptr2)
+{
+       blkif_t *blkif;
+       blkif_info_t *blk;
+       msg_hdr_t *msg;
+       msg_newdev_t *msg_dev;
+       char *buf, *path = NULL;
+       int msglen, len;
+       fd_set writefds;
+       struct timeval timeout;
+
+       blkif = (blkif_t *)ptr;
+       blk = blkif->info;
+       len = 0;
+
+       /* First pass: validate the request and size the message */
+       switch (msgtype)
+       {
+       case CTLMSG_PARAMS:
+               path = (char *)ptr2;
+               if (path == NULL) {
+                       DPRINTF("Write_msg: CTLMSG_PARAMS without a path\n");
+                       return -1;
+               }
+               DPRINTF("Write_msg called: CTLMSG_PARAMS, sending [%s, %s]\n",
+                       blk->params, path);
+               msglen = sizeof(msg_hdr_t) + strlen(path) + 1;
+               break;
+
+       case CTLMSG_NEWDEV:
+               DPRINTF("Write_msg called: CTLMSG_NEWDEV\n");
+               msglen = sizeof(msg_hdr_t) + sizeof(msg_newdev_t);
+               break;
+
+       case CTLMSG_CLOSE:
+               DPRINTF("Write_msg called: CTLMSG_CLOSE\n");
+               msglen = sizeof(msg_hdr_t);
+               break;
+
+       case CTLMSG_PID:
+               DPRINTF("Write_msg called: CTLMSG_PID\n");
+               msglen = sizeof(msg_hdr_t);
+               break;
+
+       default:
+               return -1;
+       }
+
+       /* zeroed so that no uninitialised padding bytes are sent */
+       buf = calloc(1, msglen);
+       if (buf == NULL) {
+               DPRINTF("Write_msg: out of memory\n");
+               return -1;
+       }
+
+       /*Assign header fields*/
+       msg = (msg_hdr_t *)buf;
+       msg->type = msgtype;
+       msg->len = msglen;
+       msg->drivertype = blkif->drivertype;
+       msg->cookie = blkif->cookie;
+
+       /* Second pass: append the type-specific payload, if any */
+       if (msgtype == CTLMSG_PARAMS) {
+               DPRINTF("Generated cookie, %d\n",blkif->cookie);
+               memcpy(buf + sizeof(msg_hdr_t), path, strlen(path) + 1);
+       } else if (msgtype == CTLMSG_NEWDEV) {
+               msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+               msg_dev->devnum = blkif->minor;
+               msg_dev->domid = blkif->domid;
+       }
+
+       /*Now send the message*/
+       FD_ZERO(&writefds);
+       FD_SET(fd,&writefds);
+       timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/
+       timeout.tv_usec = 0;
+       if (select(fd+1, (fd_set *) 0, &writefds,
+                 (fd_set *) 0, &timeout) > 0) {
+               len = write(fd, buf, msglen);
+               if (len == -1) DPRINTF("Write failed: (%d)\n",errno);
+       }
+       free(buf);
+
+       return len;
+}
+
+/*
+ * read_msg - wait for one control message on 'fd' and act on it.
+ *
+ *   fd      - descriptor to read from
+ *   msgtype - the response type the caller is waiting for
+ *   ptr     - the blkif_t to update from the message
+ *
+ * CTLMSG_IMG copies size/secsize/info into the image_t at blkif->prv;
+ * CTLMSG_PID_RSP (when expected) stores the PID in blkif->tappid.
+ *
+ * Returns the number of bytes read when a message of the expected type
+ * arrived, 0 on timeout (max_timeout seconds), on a failure or unexpected
+ * message, or on a message too short for its type; otherwise whatever
+ * read() returned (-1 on error), and -1 when out of memory.
+ */
+static int read_msg(int fd, int msgtype, void *ptr)
+{
+       blkif_t *blkif;
+       msg_hdr_t *msg;
+       msg_pid_t *msg_pid;
+       char *buf;
+       int msglen = MSG_SIZE, ret;
+       fd_set readfds;
+       struct timeval timeout;
+       image_t *image, *img;
+
+       blkif = (blkif_t *)ptr;
+       image = blkif->prv;
+
+       buf = malloc(MSG_SIZE);
+       if (buf == NULL) {
+               DPRINTF("Read_msg: out of memory\n");
+               return -1;
+       }
+
+       ret = 0;
+       FD_ZERO(&readfds);
+       FD_SET(fd,&readfds);
+       timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/
+       timeout.tv_usec = 0;
+       if (select(fd+1, &readfds,  (fd_set *) 0,
+                 (fd_set *) 0, &timeout) > 0) {
+               ret = read(fd, buf, msglen);
+       }
+
+       /* never interpret a header that was not completely received */
+       if (ret > 0 && (size_t)ret < sizeof(msg_hdr_t)) {
+               DPRINTF("Read_msg: short message (%d bytes)\n", ret);
+               ret = 0;
+       }
+
+       if (ret > 0) {
+               msg = (msg_hdr_t *)buf;
+               switch (msg->type)
+               {
+               case CTLMSG_IMG:
+                       /* payload must be complete and have a destination */
+                       if (image == NULL || (size_t)ret <
+                           sizeof(msg_hdr_t) + sizeof(image_t)) {
+                               DPRINTF("Read_msg: bad CTLMSG_IMG\n");
+                               ret = 0;
+                               break;
+                       }
+                       img = (image_t *)(buf + sizeof(msg_hdr_t));
+                       image->size = img->size;
+                       image->secsize = img->secsize;
+                       image->info = img->info;
+
+                       DPRINTF("Received CTLMSG_IMG: %lu, %lu, %lu\n",
+                               image->size, image->secsize, image->info);
+                       if(msgtype != CTLMSG_IMG) ret = 0;
+                       break;
+
+               case CTLMSG_IMG_FAIL:
+                       DPRINTF("Received CTLMSG_IMG_FAIL, "
+                               "unable to open image\n");
+                       ret = 0;
+                       break;
+
+               case CTLMSG_NEWDEV_RSP:
+                       DPRINTF("Received CTLMSG_NEWDEV_RSP\n");
+                       if(msgtype != CTLMSG_NEWDEV_RSP) ret = 0;
+                       break;
+
+               case CTLMSG_NEWDEV_FAIL:
+                       DPRINTF("Received CTLMSG_NEWDEV_FAIL\n");
+                       ret = 0;
+                       break;
+
+               case CTLMSG_CLOSE_RSP:
+                       DPRINTF("Received CTLMSG_CLOSE_RSP\n");
+                       if (msgtype != CTLMSG_CLOSE_RSP) ret = 0;
+                       break;
+
+               case CTLMSG_PID_RSP:
+                       DPRINTF("Received CTLMSG_PID_RSP\n");
+                       if (msgtype != CTLMSG_PID_RSP) ret = 0;
+                       else if ((size_t)ret <
+                                sizeof(msg_hdr_t) + sizeof(msg_pid_t)) {
+                               DPRINTF("Read_msg: bad CTLMSG_PID_RSP\n");
+                               ret = 0;
+                       } else {
+                               msg_pid = (msg_pid_t *)
+                                       (buf + sizeof(msg_hdr_t));
+                               blkif->tappid = msg_pid->pid;
+                               DPRINTF("\tPID: [%d]\n",blkif->tappid);
+                       }
+                       break;
+               default:
+                       DPRINTF("UNKNOWN MESSAGE TYPE RECEIVED\n");
+                       ret = 0;
+                       break;
+               }
+       }
+
+       free(buf);
+
+       return ret;
+
+}
+
+/*
+ * blktapctrl_new_blkif - hook run for a new virtual block device.
+ *
+ * Allocates a tap device for 'blkif', finds the driver named in
+ * blkif->info->params ("<handle>:<path>"), starts a tapdisk process with
+ * its pair of control FIFOs unless test_path() returned an existing one to
+ * share, and exchanges the PID, PARAMS and IMG messages with it.
+ *
+ * Returns 0 on success, -1 if there are no parameters, no tap device could
+ * be obtained or tapdisk could not be launched, and -EINVAL for any other
+ * failure.  Once a tap device was obtained every failure path releases it
+ * with BLKTAP_IOCTL_FREEINTF.
+ *
+ * NOTE(review): when a step after add_disktype() fails, the blkif stays on
+ * the active_disks list and descriptors opened here stay open; whether the
+ * caller undoes that (e.g. via the unmap hook) is not visible here.
+ */
+int blktapctrl_new_blkif(blkif_t *blkif)
+{
+       blkif_info_t *blk;
+       int major, minor, type, ret = -EINVAL;
+       char *rdctldev = NULL, *wrctldev = NULL, *cmd = NULL, *ptr;
+       image_t *image;
+       blkif_t *exist = NULL;
+
+       DPRINTF("Received a poll for a new vbd\n");
+       blk = blkif->info;
+       if ( (blk == NULL) || (blk->params == NULL) )
+               return -1;
+
+       if (get_new_dev(&major, &minor, blkif)<0)
+               return -1;
+
+       exist = test_path(blk->params, &ptr, &type);
+       if (ptr == NULL) {
+               /* no ':' in params: there is no path to hand to tapdisk */
+               DPRINTF("No driver handle in params [%s]\n", blk->params);
+               goto fail;
+       }
+       blkif->drivertype = type;
+       blkif->cookie = lrand48() % MAX_RAND_VAL;
+
+       if (!exist) {
+               DPRINTF("Process does not exist:\n");
+               /* asprintf() leaves its target undefined on failure */
+               if (asprintf(&rdctldev, "/dev/xen/tapctrlread%d",
+                            minor) == -1) {
+                       rdctldev = NULL;
+                       goto fail;
+               }
+               blkif->fds[READ] = open_ctrl_socket(rdctldev);
+
+               if (asprintf(&wrctldev, "/dev/xen/tapctrlwrite%d",
+                            minor) == -1) {
+                       wrctldev = NULL;
+                       goto fail;
+               }
+               blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
+
+               if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+                       goto fail;
+
+               /*launch the new process*/
+               if (asprintf(&cmd, "tapdisk %s %s", wrctldev,
+                            rdctldev) == -1) {
+                       cmd = NULL;
+                       goto fail;
+               }
+               DPRINTF("Launching process, CMDLINE [%s]\n",cmd);
+               if (system(cmd) == -1) {
+                       DPRINTF("Unable to fork, cmdline: [%s]\n",cmd);
+                       ret = -1;
+                       goto fail;
+               }
+
+               free(rdctldev);
+               free(wrctldev);
+               free(cmd);
+               rdctldev = wrctldev = cmd = NULL;
+       } else {
+               DPRINTF("Process exists!\n");
+               blkif->fds[READ] = exist->fds[READ];
+               blkif->fds[WRITE] = exist->fds[WRITE];
+       }
+
+       /* allocate before registering so a failure leaves no list entry */
+       image = (image_t *)malloc(sizeof(image_t));
+       if (image == NULL) {
+               DPRINTF("Unable to allocate image info\n");
+               goto fail;
+       }
+
+       add_disktype(blkif, type);
+       blkif->major = major;
+       blkif->minor = minor;
+
+       blkif->prv = (void *)image;
+       blkif->ops = &tapdisk_ops;
+
+       /*Retrieve the PID of the new process*/
+       if (get_tapdisk_pid(blkif) <= 0) {
+               DPRINTF("Unable to contact disk process\n");
+               goto fail;
+       }
+
+       /* Both of the following read and write calls will block up to
+        * max_timeout val*/
+       if (write_msg(blkif->fds[WRITE], CTLMSG_PARAMS, blkif, ptr)
+           <= 0) {
+               DPRINTF("Write_msg failed - CTLMSG_PARAMS\n");
+               goto fail;
+       }
+
+       if (read_msg(blkif->fds[READ], CTLMSG_IMG, blkif) <= 0) {
+               DPRINTF("Read_msg failure - CTLMSG_IMG\n");
+               goto fail;
+       }
+
+       return 0;
+fail:
+       /* free(NULL) is a no-op, so this is safe on every path */
+       free(rdctldev);
+       free(wrctldev);
+       free(cmd);
+       ioctl(ctlfd, BLKTAP_IOCTL_FREEINTF, minor);
+       return ret;
+}
+
+/*
+ * Hook run for a new device mapping: send CTLMSG_NEWDEV to the tapdisk
+ * process of 'blkif' and wait for CTLMSG_NEWDEV_RSP.
+ * Returns blkif->minor - 1 on success, -EINVAL if either step fails.
+ */
+int map_new_blktapctrl(blkif_t *blkif)
+{
+       int rc;
+
+       DPRINTF("Received a poll for a new devmap\n");
+
+       rc = write_msg(blkif->fds[WRITE], CTLMSG_NEWDEV, blkif, NULL);
+       if (rc <= 0) {
+               DPRINTF("Write_msg failed - CTLMSG_NEWDEV\n");
+               return -EINVAL;
+       }
+
+       rc = read_msg(blkif->fds[READ], CTLMSG_NEWDEV_RSP, blkif);
+       if (rc <= 0) {
+               DPRINTF("Read_msg failed - CTLMSG_NEWDEV_RSP\n");
+               return -EINVAL;
+       }
+
+       DPRINTF("Exiting map_new_blktapctrl\n");
+       return blkif->minor - 1;
+}
+
+/*
+ * Hook run when a device is unmapped: send CTLMSG_CLOSE to the tapdisk
+ * process, drop 'blkif' from the active list and close its control
+ * descriptors when del_disktype() says they are no longer needed.
+ * Returns 0 on success, -EINVAL if the close message could not be sent.
+ */
+int unmap_blktapctrl(blkif_t *blkif)
+{
+       int sent;
+
+       DPRINTF("Unmapping vbd\n");
+
+       sent = write_msg(blkif->fds[WRITE], CTLMSG_CLOSE, blkif, NULL);
+       if (sent <= 0) {
+               DPRINTF("Write_msg failed - CTLMSG_CLOSE\n");
+               return -EINVAL;
+       }
+
+       /* zero means the descriptors are still shared with another disk */
+       if (!del_disktype(blkif))
+               return 0;
+
+       close(blkif->fds[WRITE]);
+       close(blkif->fds[READ]);
+       return 0;
+}
+
+/*
+ * open_ctrl_socket - create (if needed) and open a control FIFO.
+ *
+ *   devname - path of the FIFO
+ *
+ * An already existing FIFO is reused.  The FIFO is opened read/write and
+ * non-blocking.  Returns the descriptor, or -1 if the FIFO could not be
+ * created or opened.
+ *
+ * NOTE(review): the FIFO is created with mode rwx for user, group and
+ * other; confirm that such wide permissions are really required.
+ */
+int open_ctrl_socket(char *devname)
+{
+       int ipc_fd;
+
+       if ( (mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO) != 0) &&
+            (errno != EEXIST) ) {
+               DPRINTF("ERROR: pipe failed (%d)\n", errno);
+               /* report the failure like an open() failure, do not exit */
+               return -1;
+       }
+
+       ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+       if (ipc_fd < 0) {
+               DPRINTF("FD open failed\n");
+               return -1;
+       }
+
+       return ipc_fd;
+}
+
+/* Log the controller version followed by the name of every dtypes[] entry. */
+static void print_drivers(void)
+{
+       int idx, count;
+
+       count = sizeof(dtypes)/sizeof(disk_info_t *);
+       DPRINTF("blktapctrl: v1.0.0\n");
+
+       idx = 0;
+       while (idx < count) {
+               DPRINTF("Found driver: [%s]\n",dtypes[idx]->name);
+               idx++;
+       }
+}
+
+/*
+ * blktapctrl entry point.
+ *
+ * Registers the blkif hooks, opens the blktap0 control device, connects
+ * to xenstore and adds the block-device probe watch, switches the driver
+ * to BLKTAP_MODE_INTERPOSE and then services xenstore watch events until
+ * 'run' is cleared by sig_handler().  On the way out the driver is put
+ * back into BLKTAP_MODE_PASSTHROUGH.
+ *
+ * Returns 0 after a normal shutdown, -1 if start-up failed.
+ */
+int main(int argc, char *argv[])
+{
+       char *devname;
+       int tap_pfd, store_pfd, ret, timeout, pfd_count;
+       struct xs_handle *h = NULL;
+       struct pollfd  pfd[NUM_POLL_FDS];
+       pid_t process;
+
+       __init_blkif();
+       openlog("BLKTAPCTRL", LOG_CONS|LOG_ODELAY, LOG_DAEMON);
+
+       print_drivers();
+       init_driver_list();
+       init_rng();
+
+       register_new_blkif_hook(blktapctrl_new_blkif);
+       register_new_devmap_hook(map_new_blktapctrl);
+       register_new_unmap_hook(unmap_blktapctrl);
+
+       /* Without these nothing ever clears 'run' and the loop never ends */
+       signal(SIGINT, sig_handler);
+       signal(SIGTERM, sig_handler);
+
+       /*Attach to blktap0 */
+       if (asprintf(&devname,"%s/%s0", BLKTAP_DEV_DIR,
+                    BLKTAP_DEV_NAME) == -1) {
+               DPRINTF("Unable to allocate device name\n");
+               goto open_failed;
+       }
+       make_blktap_dev(devname,254,0);
+       ctlfd = open(devname, O_RDWR);
+       free(devname);  /* only needed to create and open the node */
+       if (ctlfd == -1) {
+               DPRINTF("blktap0 open failed\n");
+               goto open_failed;
+       }
+
+       /* Set up store connection and watch. */
+       h = xs_daemon_open();
+       if (h == NULL) {
+               DPRINTF("xs_daemon_open failed -- "
+                       "is xenstore running?\n");
+               goto open_failed;
+       }
+
+       ret = add_blockdevice_probe_watch(h, "Domain-0");
+       if (ret != 0) {
+               DPRINTF("adding device probewatch\n");
+               goto open_failed;
+       }
+
+       ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+
+       process = getpid();
+       ret = ioctl(ctlfd, BLKTAP_IOCTL_SENDPID, process );
+
+       /*Static pollhooks*/
+       pfd_count = 0;
+       tap_pfd = pfd_count++;
+       pfd[tap_pfd].fd = ctlfd;
+       pfd[tap_pfd].events = POLLIN;
+
+       store_pfd = pfd_count++;
+       pfd[store_pfd].fd = xs_fileno(h);
+       pfd[store_pfd].events = POLLIN;
+
+       while (run) {
+               timeout = 1000; /*Milliseconds*/
+               ret = poll(pfd, pfd_count, timeout);
+
+               /* only xenstore events are acted upon here */
+               if (ret > 0) {
+                       if (pfd[store_pfd].revents) {
+                               ret = xs_fire_next_watch(h);
+                       }
+               }
+       }
+
+       ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH );
+       xs_daemon_close(h);
+       close(ctlfd);
+       closelog();
+
+       return 0;
+
+ open_failed:
+       DPRINTF("Unable to start blktapctrl\n");
+       /* release whatever was acquired before the failure */
+       if (h != NULL)
+               xs_daemon_close(h);
+       if (ctlfd > 0)
+               close(ctlfd);
+       closelog();
+       return -1;
+}
diff --git a/tools/blktap/drivers/blktapctrl.h b/tools/blktap/drivers/blktapctrl.h
new file mode 100644 (file)
index 0000000..4a5e595
--- /dev/null
@@ -0,0 +1,55 @@
+/* blktapctrl.h
+ *
+ * controller image utils.
+ * 
+ * (c) 2004-6 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+/* Return the 'size' field of the image_t stored in blkif->prv. */
+static inline long int tapdisk_get_size(blkif_t *blkif)
+{
+       return ((image_t *)blkif->prv)->size;
+}
+
+/* Return the 'secsize' field of the image_t stored in blkif->prv. */
+static inline long int tapdisk_get_secsize(blkif_t *blkif)
+{
+       return ((image_t *)blkif->prv)->secsize;
+}
+
+/* Return the 'info' field of the image_t stored in blkif->prv. */
+static inline unsigned tapdisk_get_info(blkif_t *blkif)
+{
+       return ((image_t *)blkif->prv)->info;
+}
+
+/*
+ * Operations table that routes the size, sector-size and info queries of
+ * a blkif to the image_t accessors above.
+ * NOTE(review): this is a definition (not a declaration) in a header with
+ * no include guard -- presumably the header is included by exactly one
+ * source file; confirm.
+ */
+struct blkif_ops tapdisk_ops = {
+       .get_size = tapdisk_get_size,
+       .get_secsize = tapdisk_get_secsize,
+       .get_info = tapdisk_get_info,
+};
diff --git a/tools/blktap/drivers/block-aio.c b/tools/blktap/drivers/block-aio.c
new file mode 100644 (file)
index 0000000..ebcfc35
--- /dev/null
@@ -0,0 +1,327 @@
+/* block-aio.c
+ *
+ * libaio-based raw disk implementation.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * NB: This code is not thread-safe.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include <errno.h>
+#include <libaio.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include "tapdisk.h"
+
+
+/**
+ * We use a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * Capacity of every per-disk AIO array in struct tdaio_state (iocb pool,
+ * free list, submit queue, event buffer) and the event count handed to
+ * io_setup().  MAX_REQUESTS and MAX_SEGMENTS_PER_REQ are defined outside
+ * this file (presumably tapdisk.h -- confirm).
+ */
+#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ * 8)
+
+/*
+ * Completion bookkeeping for one in-flight iocb.  The entry lives at the same
+ * index in tdaio_state.pending_aio[] as its iocb does in iocb_list[].
+ */
+struct pending_aio {
+       /* Invoked from tdaio_do_callbacks() when the request completes. */
+       td_callback_t cb;
+       /* Caller-supplied id, passed back to cb unchanged. */
+       int id;
+       /* Caller-supplied cookie, passed back to cb unchanged. */
+       void *private;
+};
+
+/* Private per-disk state of the libaio raw-image driver (td_state.private). */
+struct tdaio_state {
+       /* Descriptor of the image file/device, set by tdaio_open(). */
+       int fd;
+       
+       /* libaio state */
+       /* AIO context created by io_setup() in tdaio_open(). */
+       io_context_t       aio_ctx;
+       /* Backing storage for all control blocks. */
+       struct iocb        iocb_list  [MAX_AIO_REQS];
+       /* Stack of unused control blocks; iocb_free_count is its depth. */
+       struct iocb       *iocb_free  [MAX_AIO_REQS];
+       /* Completion info, indexed like iocb_list (see IOCB_IDX). */
+       struct pending_aio pending_aio[MAX_AIO_REQS];
+       int                iocb_free_count;
+       /* Requests prepared but not yet passed to io_submit(). */
+       struct iocb       *iocb_queue[MAX_AIO_REQS];
+       /* Number of valid entries in iocb_queue. */
+       int                iocb_queued;
+       /* Return value of io_setup(); handed out by tdaio_get_fd(). */
+       int                poll_fd; /* NB: we require aio_poll support */
+       /* Scratch buffer filled by io_getevents() in tdaio_do_callbacks(). */
+       struct io_event    aio_events[MAX_AIO_REQS];
+};
+
+/* Index of control block _io within _s->iocb_list (pointer difference); the
+ * same index selects its entry in _s->pending_aio. */
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/*Get Image size, secsize*/
+static int get_image_info(struct td_state *s, int fd)
+{
+       int ret;
+       long size;
+       unsigned long total_size;
+       struct statvfs statBuf;
+       struct stat stat;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               s->size = 0;
+               if (ioctl(fd,BLKGETSIZE,&s->size)!=0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int arg;
+                       s->sector_size = DEFAULT_SECTOR_SIZE;
+                       ioctl(fd, BLKSSZGET, &s->sector_size);
+                       
+                       if (s->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       s->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               s->size = (stat.st_size >> SECTOR_SHIFT);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+       }
+
+       if (s->size == 0) {             
+               s->size =((uint64_t) 16836057);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       s->info = 0;
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize aio state.
+ *
+ * s->private must already point at storage for a struct tdaio_state.
+ * Sets up the AIO context (requesting a pollable fd via REQUEST_ASYNC_FD),
+ * fills the free-iocb stack, opens 'name' read/write (with O_DIRECT when the
+ * open accepts it) and records the image geometry through get_image_info().
+ *
+ * Returns 0 on success, the negative io_setup() result, -errno from open(),
+ * or the get_image_info() error.
+ *
+ * NOTE(review): the AIO context is not destroyed when open() fails, and the
+ * descriptor stays open when get_image_info() fails; whether the caller
+ * invokes tdaio_close() after a failed open is not visible here.
+ */
+int tdaio_open (struct td_state *s, const char *name)
+{
+       int i, fd, ret = 0;
+       struct tdaio_state *prv = (struct tdaio_state *)s->private;
+
+       DPRINTF("XXX: block-aio open('%s')", name);
+       /* Initialize AIO */
+       prv->iocb_free_count = MAX_AIO_REQS;
+       prv->iocb_queued     = 0;
+
+       prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
+       prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
+
+       if (prv->poll_fd < 0) {
+               ret = prv->poll_fd;
+               DPRINTF("Couldn't get fd for AIO poll support.  This is "
+                       "probably because your kernel does not have the "
+                       "aio-poll patch applied.\n");
+               goto done;
+       }
+
+       for (i=0;i<MAX_AIO_REQS;i++)
+               prv->iocb_free[i] = &prv->iocb_list[i];
+
+       /* Open the file */
+       fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE);
+
+       if ( (fd == -1) && (errno == EINVAL) ) {
+
+               /* Maybe O_DIRECT isn't supported. */
+               fd = open(name, O_RDWR | O_LARGEFILE);
+               if (fd != -1) DPRINTF("WARNING: Accessing image without "
+                                     "O_DIRECT! (%s)\n", name);
+
+       } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+       if (fd == -1) {
+               /* Capture errno first: logging may overwrite it, which
+                * could turn this failure into a 0 ("success") return. */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s] (%d)!\n", name, ret);
+               goto done;
+       }
+
+       prv->fd = fd;
+
+       ret = get_image_info(s, fd);
+done:
+       return ret;
+}
+
+/*
+ * Prepare an asynchronous read of 'nb_sectors' sectors starting at 'sector'
+ * into 'buf' and append it to the submit queue.  Nothing reaches the kernel
+ * until tdaio_submit().  'cb', 'id' and 'private' are stored for the
+ * completion callback.
+ *
+ * Returns 0 on success, -ENOMEM when no free control block is left.
+ */
+int tdaio_queue_read(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdaio_state *state = (struct tdaio_state *)s->private;
+       struct pending_aio *pending;
+       struct iocb *iocb;
+       uint64_t byte_offset;
+       int byte_count;
+       long slot;
+
+       if (state->iocb_free_count == 0)
+               return -ENOMEM;
+
+       byte_count  = nb_sectors * s->sector_size;
+       byte_offset = sector * (uint64_t)s->sector_size;
+
+       /* Pop a control block off the free stack. */
+       state->iocb_free_count--;
+       iocb = state->iocb_free[state->iocb_free_count];
+       slot = IOCB_IDX(state, iocb);
+
+       /* Remember what to tell the caller on completion. */
+       pending = &state->pending_aio[slot];
+       pending->private = private;
+       pending->id = id;
+       pending->cb = cb;
+
+       io_prep_pread(iocb, state->fd, buf, byte_count, byte_offset);
+       /* The slot index travels with the iocb so completion can find it. */
+       iocb->data = (void *)slot;
+
+       state->iocb_queue[state->iocb_queued] = iocb;
+       state->iocb_queued++;
+
+       return 0;
+}
+                       
+/*
+ * Prepare an asynchronous write of 'nb_sectors' sectors starting at 'sector'
+ * from 'buf' and append it to the submit queue.  Nothing reaches the kernel
+ * until tdaio_submit().  'cb', 'id' and 'private' are stored for the
+ * completion callback.
+ *
+ * Returns 0 on success, -ENOMEM when no free control block is left.
+ */
+int tdaio_queue_write(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdaio_state *state = (struct tdaio_state *)s->private;
+       struct pending_aio *pending;
+       struct iocb *iocb;
+       uint64_t byte_offset;
+       int byte_count;
+       long slot;
+
+       if (state->iocb_free_count == 0)
+               return -ENOMEM;
+
+       byte_count  = nb_sectors * s->sector_size;
+       byte_offset = sector * (uint64_t)s->sector_size;
+
+       /* Pop a control block off the free stack. */
+       state->iocb_free_count--;
+       iocb = state->iocb_free[state->iocb_free_count];
+       slot = IOCB_IDX(state, iocb);
+
+       /* Remember what to tell the caller on completion. */
+       pending = &state->pending_aio[slot];
+       pending->private = private;
+       pending->id = id;
+       pending->cb = cb;
+
+       io_prep_pwrite(iocb, state->fd, buf, byte_count, byte_offset);
+       /* The slot index travels with the iocb so completion can find it. */
+       iocb->data = (void *)slot;
+
+       state->iocb_queue[state->iocb_queued] = iocb;
+       state->iocb_queued++;
+
+       return 0;
+}
+                       
+/*
+ * Hand every queued request to the kernel with a single io_submit() call and
+ * empty the queue.
+ *
+ * Returns the io_submit() result: the number of requests accepted, or a
+ * negative error code.
+ *
+ * Requests the kernel did not accept (error or partial submission) are
+ * logged and their control blocks go back on the free stack, so the pool
+ * does not shrink permanently.  Their callbacks are still never invoked.
+ * XXX: TODO: report those requests to the caller.
+ */
+int tdaio_submit(struct td_state *s)
+{
+       int ret, i, accepted;
+       struct   tdaio_state *prv = (struct tdaio_state *)s->private;
+
+       ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);
+
+       /* A negative return means nothing was accepted. */
+       accepted = (ret > 0) ? ret : 0;
+
+       if (accepted < prv->iocb_queued) {
+               DPRINTF("io_submit accepted %d of %d requests (ret %d)\n",
+                       accepted, prv->iocb_queued, ret);
+
+               /* The kernel never saw these iocbs, so no completion event
+                * will ever return them: recycle them here. */
+               for (i = accepted; i < prv->iocb_queued; i++)
+                       prv->iocb_free[prv->iocb_free_count++] =
+                               prv->iocb_queue[i];
+       }
+
+       prv->iocb_queued = 0;
+
+       return ret;
+}
+
+/*
+ * Return a freshly malloc'd array of MAX_IOFD descriptors for the caller to
+ * poll on.  Slot 0 holds the AIO poll fd; every other slot is 0.
+ *
+ * Returns NULL if the array cannot be allocated.  The caller owns the
+ * returned memory.
+ */
+int *tdaio_get_fd(struct td_state *s)
+{
+       struct tdaio_state *prv = (struct tdaio_state *)s->private;
+       int *fds, i;
+
+       fds = malloc(sizeof(int) * MAX_IOFD);
+       if (fds == NULL)
+               return NULL;
+
+       /*initialise the FD array*/
+       for(i=0;i<MAX_IOFD;i++) fds[i] = 0;
+
+       fds[0] = prv->poll_fd;
+
+       return fds;
+}
+
+/*
+ * Tear down the disk: destroy the AIO context, then close the image
+ * descriptor.  Always returns 0; the results of both calls are ignored.
+ */
+int tdaio_close(struct td_state *s)
+{
+       struct tdaio_state *state = (struct tdaio_state *)s->private;
+
+       (void)io_destroy(state->aio_ctx);
+       (void)close(state->fd);
+
+       return 0;
+}
+
+/*
+ * Reap every completed AIO request without blocking, run its stored callback
+ * and return its control block to the free stack.
+ *
+ * 'sid' is unused.  Returns the sum of the callbacks' return values (0 when
+ * nothing completed or io_getevents() failed).
+ *
+ * NOTE(review): the callback receives the event's 'res2' field; a short or
+ * failed transfer (res != nbytes) is only logged.  Confirm that is what the
+ * callbacks expect.
+ */
+int tdaio_do_callbacks(struct td_state *s, int sid)
+{
+       struct tdaio_state *state = (struct tdaio_state *)s->private;
+       int nr_events, n, responses = 0;
+
+       /* Non-blocking test for completed io. */
+       nr_events = io_getevents(state->aio_ctx, 0, MAX_AIO_REQS,
+                                state->aio_events, NULL);
+
+       /* A negative (error) result simply skips the loop. */
+       for (n = 0; n < nr_events; n++) {
+               struct io_event    *event   = &state->aio_events[n];
+               struct iocb        *iocb    = event->obj;
+               struct pending_aio *pending =
+                       &state->pending_aio[(long)iocb->data];
+
+               if (event->res != iocb->u.c.nbytes) {
+                       /* TODO: handle this case better. */
+                       DPRINTF("AIO did less than I asked it to. \n");
+               }
+               responses += pending->cb(s, event->res2, pending->id,
+                                        pending->private);
+
+               state->iocb_free[state->iocb_free_count++] = iocb;
+       }
+       return responses;
+}
+       
+/*
+ * Driver descriptor for the libaio raw-image backend: a name, the size of the
+ * per-disk private state, and the driver entry points defined in this file.
+ * The initializer is positional, so the entries must stay in the member order
+ * of struct tap_disk (declared outside this file; presumably tapdisk.h --
+ * confirm before reordering).
+ */
+struct tap_disk tapdisk_aio = {
+       "tapdisk_aio",
+       sizeof(struct tdaio_state),
+       tdaio_open,
+       tdaio_queue_read,
+       tdaio_queue_write,
+       tdaio_submit,
+       tdaio_get_fd,
+       tdaio_close,
+       tdaio_do_callbacks,
+};
diff --git a/tools/blktap/drivers/block-qcow.c b/tools/blktap/drivers/block-qcow.c
new file mode 100644 (file)
index 0000000..7eab8c9
--- /dev/null
@@ -0,0 +1,1369 @@
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ * 
+ * Copyright (c) 2004 Fabrice Bellard
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+#include "bswap.h"
+#include "aes.h"
+#include "tapdisk.h"
+
+/*
+ * ASSERT(_p): when _p is false, log the failed expression with its line and
+ * file through DPRINTF, then store through a null pointer to force a crash.
+ * Change "#if 1" to "#if 0" to compile assertions out.
+ *
+ * NOTE(review): the enabled form expands to a bare "if (...) { ... }", so
+ * ASSERT(x); is not safe as the only statement of an unbraced if/else.
+ */
+#if 1
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+
+/******AIO DEFINES******/
+/* Value stored in the io_context_t before io_setup() (see init_aio_state)
+ * to ask the kernel for a pollable completion fd. */
+#define REQUEST_ASYNC_FD 1
+/* Number of entries in tdqcow_state.nr_reqs[], scanned by get_free_idx(). */
+#define MAX_QCOW_IDS  0xFFFF
+/* Capacity of the iocb pool, free list, submit queue and event buffer, and
+ * the event count handed to io_setup(). */
+#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ * 8)
+
+/*
+ * Completion bookkeeping for one in-flight iocb, filled in by async_read()
+ * and async_write().  The entry lives at the same index in
+ * tdqcow_state.pending_aio[] as its iocb does in iocb_list[].
+ */
+struct pending_aio {
+        /* Caller's completion callback (get_cluster_offset passes 0 for its
+         * internal L2 table writes). */
+        td_callback_t cb;
+        /* Caller-supplied request id. */
+        int id;
+        /* Caller-supplied cookie. */
+        void *private;
+       /* Transfer length in 512-byte units (size/512). */
+       int nb_sectors;
+       /* Data buffer handed to the iocb. */
+       char *buf;
+       /* Sector value supplied by the caller of async_read/async_write. */
+       uint64_t sector;
+       /* qcow_idx value supplied by the caller; presumably an index into
+        * nr_reqs[] -- confirm against the completion path. */
+       int qcow_idx;
+};
+
+/* Index of control block _io within _s->iocb_list (pointer difference); the
+ * same index selects its entry in _s->pending_aio. */
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/* Evaluates to _b itself (OR with zero): non-zero when the byte is non-zero.
+ * Used by IS_ZERO(). */
+#define ZERO_TEST(_b) (_b | 0x00)
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+/* Magic numbers: the ASCII characters 'Q','F','I' (resp. 'X','E','N') in the
+ * upper three bytes followed by 0xfb.  XEN_MAGIC matches the xmagic field of
+ * the Xen extended header (QCowHeader_ext). */
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+/* Values for the header's crypt_method field. */
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+
+/* Top bit of an L2 entry: the cluster is stored compressed.  Built from an
+ * unsigned constant, since shifting a signed 1 into bit 63 is undefined. */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+
+/* Provide a no-op O_BINARY where the platform headers do not define one, so
+ * it can be OR'ed into open() flags unconditionally. */
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+/*
+ * QCOW image header.
+ * NOTE(review): presumably read from the start of the image file with
+ * multi-byte fields in big-endian order, as in QEMU -- the code that parses
+ * it is outside this view.  The struct is not declared packed, so the
+ * compiler may pad after l2_bits; confirm this matches the on-disk layout.
+ */
+typedef struct QCowHeader {
+       uint32_t magic;
+       uint32_t version;
+       uint64_t backing_file_offset;
+       uint32_t backing_file_size;
+       uint32_t mtime;
+       uint64_t size; /* in bytes */
+       uint8_t cluster_bits;
+       uint8_t l2_bits;
+       uint32_t crypt_method;
+       uint64_t l1_table_offset;
+} QCowHeader;
+
+/*Extended header for Xen enhancements*/
+/*
+ * Xen-specific header extension.
+ * NOTE(review): xmagic presumably holds XEN_MAGIC and cksum a gen_cksum()
+ * value; where the extension sits in the file and what is checksummed are
+ * not visible here.
+ */
+typedef struct QCowHeader_ext {
+        uint32_t xmagic;
+        uint32_t cksum;
+        uint32_t min_cluster_alloc;
+} QCowHeader_ext;
+
+/* Number of L2 tables kept in tdqcow_state.l2_cache (one slot each in
+ * l2_cache_offsets[] and l2_cache_counts[]). */
+#define L2_CACHE_SIZE 16  /*Fixed allocation in Qemu*/
+
+/* Private per-disk state of the QCOW driver (td_state.private). */
+struct tdqcow_state {
+        int fd;                        /*Main Qcow file descriptor */
+       uint64_t fd_end;               /*Store a local record of file length */
+       int bfd;                       /*Backing file descriptor*/
+       char *name;                    /*Record of the filename*/
+       int poll_pipe[2];              /*dummy fd for polling on */
+       int encrypted;                 /*File contents are encrypted or plain*/
+       int cluster_bits;              /*Determines length of cluster as 
+                                       *indicated by file hdr*/
+       int cluster_size;              /*Length of cluster in bytes*/
+       int cluster_sectors;           /*Number of sectors per cluster*/
+       int cluster_alloc;             /*Blktap fix for allocating full 
+                                       *extents: when equal to l2_size, a new
+                                       *L2 table gets all of its clusters
+                                       *allocated at once*/
+       int min_cluster_alloc;         /*Blktap historical extent alloc: when
+                                       *equal to l2_size, lookups are computed
+                                       *directly from the L1 entry*/
+       int l2_bits;                   /*Shift used to index L2 tables inside
+                                       *l2_cache (slot << l2_bits)*/
+       int l2_size;                   /*Number of 64-bit entries per L2 table*/
+       int l1_size;                   /*L1 table size*/
+       uint64_t cluster_offset_mask;    
+       uint64_t l1_table_offset;      /*L1 table offset from beginning of 
+                                       *file*/
+       uint64_t *l1_table;            /*L1 table entries*/
+       uint64_t *l2_cache;            /*We maintain a cache of size 
+                                       *L2_CACHE_SIZE of most read entries*/
+       uint64_t l2_cache_offsets[L2_CACHE_SIZE];     /*File offset of the L2
+                                                      *table in each slot*/
+       uint32_t l2_cache_counts[L2_CACHE_SIZE];      /*Hit count per slot; the
+                                                      *lowest is evicted*/
+       uint8_t *cluster_cache;        /*Cluster buffer written back out after
+                                       *decompress_cluster()*/
+       uint8_t *cluster_data;         /*Scratch buffer; used for the filler
+                                       *sectors of new encrypted clusters*/
+       uint8_t *sector_lock;          /*Per-sector lock counters (one byte
+                                       *each) for AIO reads/writes*/
+       uint64_t cluster_cache_offset; /**/
+       uint32_t crypt_method;         /*current crypt method, 0 if no 
+                                       *key yet */
+       uint32_t crypt_method_header;  /*Copied into crypt_method by
+                                       *qcow_set_key()*/
+       AES_KEY aes_encrypt_key;       /*AES key*/
+       AES_KEY aes_decrypt_key;       /*AES key*/
+        /* libaio state */
+        io_context_t       aio_ctx;
+       /* Per-id counters; an entry of 0 marks the id as free
+        * (see get_free_idx). */
+       int                nr_reqs [MAX_QCOW_IDS];
+        /* Backing storage for all control blocks. */
+        struct iocb        iocb_list  [MAX_AIO_REQS];
+        /* Stack of unused control blocks; iocb_free_count is its depth. */
+        struct iocb       *iocb_free  [MAX_AIO_REQS];
+        /* Completion info, indexed like iocb_list (see IOCB_IDX). */
+        struct pending_aio pending_aio[MAX_AIO_REQS];
+        int                iocb_free_count;
+        /* Requests prepared but not yet submitted; iocb_queued is the count. */
+        struct iocb       *iocb_queue[MAX_AIO_REQS];
+        int                iocb_queued;
+        int                poll_fd;      /* NB: we require aio_poll support */
+        struct io_event    aio_events[MAX_AIO_REQS];
+};
+
+/* Forward declaration: get_cluster_offset() calls this before the definition
+ * appears.  A negative return is treated as failure there. */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+/*
+ * Allocate the per-sector lock array and set up the libaio context, free
+ * control-block stack and request-id table for the QCOW driver.
+ *
+ * bs->size must already be set: one lock byte is allocated per sector.
+ *
+ * Returns 0 on success, -1 on failure.  On failure no memory allocated here
+ * stays behind (s->sector_lock is NULL).
+ */
+static int init_aio_state(struct td_state *bs)
+{
+       int i;
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+
+       /*Initialize Locking bitmap*/
+       s->sector_lock = calloc(1, bs->size);
+
+       if (!s->sector_lock) {
+               DPRINTF("Failed to allocate sector lock\n");
+               goto fail;
+       }
+
+       /* Initialize AIO */
+       s->iocb_free_count = MAX_AIO_REQS;
+       s->iocb_queued     = 0;
+
+       /*Signal kernel to create Poll FD for Async completion events*/
+       s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
+       s->poll_fd = io_setup(MAX_AIO_REQS, &s->aio_ctx);
+
+       if (s->poll_fd < 0) {
+               DPRINTF("Retrieving Async poll fd failed\n");
+               /* Do not leak the lock array; clear the pointer so a later
+                * free() of it stays harmless. */
+               free(s->sector_lock);
+               s->sector_lock = NULL;
+               goto fail;
+       }
+
+       for (i=0;i<MAX_AIO_REQS;i++)
+               s->iocb_free[i] = &s->iocb_list[i];
+       for (i=0;i<MAX_QCOW_IDS;i++)
+               s->nr_reqs[i] = 0;
+       DPRINTF("AIO state initialised\n");
+
+       return 0;
+
+ fail:
+       return -1;
+}
+
+/*
+ * Check whether the first 'len' bytes of 'buf' are all zero.
+ * Return:
+ *       1 if every byte is zero (also when len <= 0)
+ *       0 as soon as a non-zero byte is found
+ */
+static inline int IS_ZERO(char *buf, int len)
+{
+       int idx = 0;
+
+       while (idx < len) {
+               /* first non-zero byte settles it */
+               if (ZERO_TEST(buf[idx]))
+                       return 0;
+               idx++;
+       }
+       return 1;
+}
+
+/*
+ * Compute a 32-bit checksum of 'len' bytes at 'ptr': the first four bytes of
+ * the MD5 digest, copied as stored (host byte order).
+ *
+ * Returns 0 if MD5() does not return the digest buffer.
+ */
+static uint32_t gen_cksum(char *ptr, int len)
+{
+       /* The digest is small and fixed-size, so it lives on the stack:
+        * nothing to allocate and nothing to leak on the error path. */
+       unsigned char md[MD5_DIGEST_LENGTH];
+       uint32_t ret;
+
+       if (MD5((unsigned char *)ptr, len, md) != md)
+               return 0;
+
+       memcpy(&ret, md, sizeof(uint32_t));
+       return ret;
+}
+
+static int qcow_set_key(struct td_state *bs, const char *key)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       uint8_t keybuf[16];
+       int len, i;
+       
+       memset(keybuf, 0, 16);
+       len = strlen(key);
+       if (len > 16)
+               len = 16;
+       /* XXX: we could compress the chars to 7 bits to increase
+          entropy */
+       for (i = 0; i < len; i++) {
+               keybuf[i] = key[i];
+       }
+       s->crypt_method = s->crypt_method_header;
+       
+       if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+               return -1;
+       if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+               return -1;
+#if 0
+       /* test */
+       {
+               uint8_t in[16];
+               uint8_t out[16];
+               uint8_t tmp[16];
+               for (i=0; i<16; i++)
+                       in[i] = i;
+               AES_encrypt(in, tmp, &s->aes_encrypt_key);
+               AES_decrypt(tmp, out, &s->aes_decrypt_key);
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", tmp[i]);
+               DPRINTF("\n");
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", out[i]);
+               DPRINTF("\n");
+       }
+#endif
+       return 0;
+}
+
+/*
+ * Prepare an asynchronous read of 'size' bytes at file 'offset' on 'fd' into
+ * 'buf' and append it to the submit queue.  'cb', 'id', 'sector',
+ * 'qcow_idx' and 'private' are recorded in the request's pending_aio slot.
+ *
+ * Returns 1 when the request was queued, -ENOMEM when no free control block
+ * is left (the pool would otherwise be indexed at -1).
+ */
+static int async_read(struct tdqcow_state *s, int fd, int size, 
+                    uint64_t offset,
+                    char *buf, td_callback_t cb,
+                    int id, uint64_t sector, int qcow_idx, void *private)
+{
+       struct   iocb *io;
+       struct   pending_aio *pio;
+       long     ioidx;
+
+       if (s->iocb_free_count <= 0)
+               return -ENOMEM;
+
+       io = s->iocb_free[--s->iocb_free_count];
+
+       ioidx = IOCB_IDX(s, io);
+       pio = &s->pending_aio[ioidx];
+       pio->cb = cb;
+       pio->id = id;
+       pio->private = private;
+       pio->nb_sectors = size/512;
+       pio->buf = buf;
+       pio->sector = sector;
+       pio->qcow_idx = qcow_idx;
+
+       io_prep_pread(io, fd, buf, size, offset);
+       /* The slot index travels with the iocb so completion can find it. */
+       io->data = (void *)ioidx;
+
+       s->iocb_queue[s->iocb_queued++] = io;
+
+       return 1;
+}
+
+/*
+ * Prepare an asynchronous write of 'size' bytes from 'buf' to file 'offset'
+ * on 'fd' and append it to the submit queue.  'cb', 'id', 'sector',
+ * 'qcow_idx' and 'private' are recorded in the request's pending_aio slot.
+ *
+ * Returns 1 when the request was queued, -ENOMEM when no free control block
+ * is left (the pool would otherwise be indexed at -1).
+ */
+static int async_write(struct tdqcow_state *s, int fd, int size, 
+                    uint64_t offset,
+                    char *buf, td_callback_t cb,
+                     int id, uint64_t sector, int qcow_idx, void *private)
+{
+       struct   iocb *io;
+       struct   pending_aio *pio;
+       long     ioidx;
+
+       if (s->iocb_free_count <= 0)
+               return -ENOMEM;
+
+       io = s->iocb_free[--s->iocb_free_count];
+
+       ioidx = IOCB_IDX(s, io);
+       pio = &s->pending_aio[ioidx];
+       pio->cb = cb;
+       pio->id = id;
+       pio->private = private;
+       pio->nb_sectors = size/512;
+       pio->buf = buf;
+       pio->sector = sector;
+       pio->qcow_idx = qcow_idx;
+
+       io_prep_pwrite(io, fd, buf, size, offset);
+       /* The slot index travels with the iocb so completion can find it. */
+       io->data = (void *)ioidx;
+
+       s->iocb_queue[s->iocb_queued++] = io;
+
+       return 1;
+}
+
+/*TODO: Fix sector span!*/
+/* Return 1 when 'sector' currently has no lock held on it, 0 otherwise. */
+static int aio_can_lock(struct tdqcow_state *s, uint64_t sector)
+{
+       return s->sector_lock[sector] == 0;
+}
+
+/* Take one more lock on 'sector' and return the new lock count. */
+static int aio_lock(struct tdqcow_state *s, uint64_t sector)
+{
+       s->sector_lock[sector] += 1;
+       return s->sector_lock[sector];
+}
+
+/* Drop one lock on 'sector'; does nothing when the sector is not locked. */
+static void aio_unlock(struct tdqcow_state *s, uint64_t sector)
+{
+       if (s->sector_lock[sector] != 0)
+               s->sector_lock[sector] -= 1;
+}
+
+/*TODO - Use a freelist*/
+/*
+ * Linear scan for the lowest request id whose nr_reqs[] counter is zero.
+ * Returns that id, or -1 when all MAX_QCOW_IDS ids are in use.
+ */
+static int get_free_idx(struct tdqcow_state *s)
+{
+       int idx = 0;
+
+       while (idx < MAX_QCOW_IDS) {
+               if (s->nr_reqs[idx] == 0)
+                       return idx;
+               idx++;
+       }
+       return -1;
+}
+
+/* 
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported .
+ *
+ * Runs AES-CBC over 'nb_sectors' consecutive 512-byte sectors, each one
+ * independently.  The IV of a sector is its sector number in little-endian
+ * order in the low 8 bytes, with the high 8 bytes zero.  'enc' is passed
+ * straight through to AES_cbc_encrypt.
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+       union {
+               uint64_t ll[2];
+               uint8_t b[16];
+       } ivec;
+       int n = 0;
+
+       while (n < nb_sectors) {
+               /* AES_cbc_encrypt updates the IV, so rebuild it each time. */
+               ivec.ll[0] = cpu_to_le64(sector_num + n);
+               ivec.ll[1] = 0;
+               AES_cbc_encrypt(in_buf + (size_t)n * 512,
+                               out_buf + (size_t)n * 512,
+                               512, key, ivec.b, enc);
+               n++;
+       }
+}
+
+
+/* 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size 
+ *
+ * return 0 if not allocated.
+ *
+ * Translates the guest byte 'offset' into the file offset of the cluster
+ * holding it, walking L1 -> L2 (through the L2 cache) and, when asked,
+ * allocating the L2 table and/or the cluster.  L1 updates are written
+ * synchronously; the changed 4KByte block of the L2 table is queued as an
+ * asynchronous write whose buffer is not released in this function.
+ *
+ * Also returns 0 when a table read/write or a buffer allocation fails.
+ *
+ * NOTE(review): two synchronous data-write failure paths "return -1", which
+ * in this uint64_t function is a non-zero value with the compressed flag
+ * set; confirm how callers treat it before changing.
+ * NOTE(review): the L1 block is written from the in-memory table as-is while
+ * the big-endian value computed in 'tmp' goes unused; confirm the intended
+ * on-disk byte order of L1 entries.
+ * NOTE(review): ftruncate() results are ignored throughout.
+ */
+static uint64_t get_cluster_offset(struct td_state *bs,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+       char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr;
+       uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+       uint64_t old_l2_entry;
+       uint32_t min_count;
+       int new_l2_table;
+
+       /*Check L1 table for the extent offset*/
+       l1_index = offset >> (s->l2_bits + s->cluster_bits);
+       l2_offset = s->l1_table[l1_index];
+       new_l2_table = 0;
+       if (!l2_offset) {
+               if (!allocate)
+                       return 0;
+               /* 
+                * allocating a new l2 entry + extent 
+                * at the end of the file, we must also
+                * update the L1 entry safely.
+                */
+               l2_offset = s->fd_end;
+
+               /* round to cluster size */
+               l2_offset = (l2_offset + s->cluster_size - 1) 
+                       & ~(s->cluster_size - 1);
+
+               /* update the L1 entry */
+               s->l1_table[l1_index] = l2_offset;
+               tmp = cpu_to_be64(l2_offset);
+               
+               /*Truncate file for L2 table 
+                *(initialised to zero in case we crash)*/
+               ftruncate(s->fd, l2_offset + (s->l2_size * sizeof(uint64_t)));
+               /* Record the length the file was just set to.  Adding the
+                * table size to the old value would ignore the rounding
+                * applied to l2_offset above. */
+               s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+               /*Update the L1 table entry on disk
+                 * (for O_DIRECT we write 4KByte blocks)*/
+               l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+               l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L1 table\n");
+                       /* tmp_ptr is not valid: bail out like the write
+                        * failure below instead of copying into it. */
+                       return 0;
+               }
+               memcpy(tmp_ptr, l1_ptr, 4096);
+
+               /*
+                * Issue non-asynchronous L1 write.
+                * For safety, we must ensure that
+                * entry is written before blocks.
+                */
+               lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
+               if (write(s->fd, tmp_ptr, 4096) != 4096) {
+                       free(tmp_ptr);
+                       return 0;
+               }
+               free(tmp_ptr);
+
+               new_l2_table = 1;
+               goto cache_miss;
+       } else if (s->min_cluster_alloc == s->l2_size) {
+               /*Fast-track the request*/
+               cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+               l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+               return cluster_offset + (l2_index * s->cluster_size);
+       }
+
+       /*Check to see if L2 entry is already cached*/
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (l2_offset == s->l2_cache_offsets[i]) {
+                       /* increment the hit count */
+                       if (++s->l2_cache_counts[i] == 0xffffffff) {
+                               /* about to overflow: halve every counter */
+                               for (j = 0; j < L2_CACHE_SIZE; j++) {
+                                       s->l2_cache_counts[j] >>= 1;
+                               }
+                       }
+                       l2_table = s->l2_cache + (i << s->l2_bits);
+                       goto found;
+               }
+       }
+
+cache_miss:
+       /* not found: load a new entry in the least used one */
+       min_index = 0;
+       min_count = 0xffffffff;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (s->l2_cache_counts[i] < min_count) {
+                       min_count = s->l2_cache_counts[i];
+                       min_index = i;
+               }
+       }
+       l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+       /*If extent pre-allocated, read table from disk, 
+        *otherwise write new table to disk*/
+       if (new_l2_table) {
+               /*Should we allocate the whole extent? Adjustable parameter.*/
+               if (s->cluster_alloc == s->l2_size) {
+                       cluster_offset = l2_offset + 
+                               (s->l2_size * sizeof(uint64_t));
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       ftruncate(s->fd, cluster_offset + 
+                                 (s->cluster_size * s->l2_size));
+                       s->fd_end = cluster_offset + 
+                               (s->cluster_size * s->l2_size);
+                       for (i = 0; i < s->l2_size; i++) {
+                               l2_table[i] = cpu_to_be64(cluster_offset + 
+                                                         (i*s->cluster_size));
+                       }  
+               } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       } else {
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != 
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       }
+       
+       /*Update the cache entries*/ 
+       s->l2_cache_offsets[min_index] = l2_offset;
+       s->l2_cache_counts[min_index] = 1;
+
+found:
+       /*The extent is split into 's->l2_size' blocks of 
+        *size 's->cluster_size'*/
+       l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+       cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+       if (!cluster_offset || 
+           ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+               if (!allocate)
+                       return 0;
+               
+               if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+                   (n_end - n_start) < s->cluster_sectors) {
+                       /* cluster is already allocated but compressed, we must
+                          decompress it in the case it is not completely
+                          overwritten */
+                       if (decompress_cluster(s, cluster_offset) < 0)
+                               return 0;
+                       cluster_offset = lseek(s->fd, 0, SEEK_END);
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       /* write the cluster content - not asynchronous */
+                       lseek(s->fd, cluster_offset, SEEK_SET);
+                       if (write(s->fd, s->cluster_cache, s->cluster_size) != 
+                           s->cluster_size)
+                           return -1;
+               } else {
+                       /* allocate a new cluster */
+                       cluster_offset = lseek(s->fd, 0, SEEK_END);
+                       if (allocate == 1) {
+                               /* round to cluster size */
+                               cluster_offset = 
+                                       (cluster_offset + s->cluster_size - 1) 
+                                       & ~(s->cluster_size - 1);
+                               ftruncate(s->fd, cluster_offset + 
+                                         s->cluster_size);
+                               /* if encrypted, we must initialize the cluster
+                                  content which won't be written */
+                               if (s->crypt_method && 
+                                   (n_end - n_start) < s->cluster_sectors) {
+                                       uint64_t start_sect;
+                                       start_sect = (offset & 
+                                                     ~(s->cluster_size - 1)) 
+                                                             >> 9;
+                                       memset(s->cluster_data + 512, 
+                                              0xaa, 512);
+                                       for (i = 0; i < s->cluster_sectors;i++)
+                                       {
+                                               if (i < n_start || i >= n_end) 
+                                               {
+                                                       encrypt_sectors(s, start_sect + i, 
+                                                                       s->cluster_data, 
+                                                                       s->cluster_data + 512, 1, 1,
+                                                                       &s->aes_encrypt_key);
+                                                       lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
+                                                       if (write(s->fd, s->cluster_data, 512) != 512)
+                                                               return -1;
+                                               }
+                                       }
+                               }
+                       } else {
+                               cluster_offset |= QCOW_OFLAG_COMPRESSED | 
+                                       (uint64_t)compressed_size 
+                                               << (63 - s->cluster_bits);
+                       }
+               }
+
+               /*For IO_DIRECT we write 4KByte blocks*/
+               l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+               l2_ptr = (char *)l2_table + (l2_sector << 12);
+
+               /* Get the bounce buffer before touching the cached table, so
+                * an allocation failure leaves the cache unchanged. */
+               if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L2 table\n");
+                       return 0;
+               }
+
+               /* update L2 table */
+               old_l2_entry = l2_table[l2_index];
+               tmp = cpu_to_be64(cluster_offset);
+               l2_table[l2_index] = tmp;
+
+               memcpy(tmp_ptr2, l2_ptr, 4096);
+               aio_lock(s, offset >> 9);
+               if (async_write(s, s->fd, 4096, l2_offset + (l2_sector << 12), 
+                               tmp_ptr2, 0, -2, offset >> 9, 0, NULL) < 0) {
+                       /* The table update could not be queued: undo the
+                        * lock and the cached entry, release the buffer. */
+                       DPRINTF("ERROR queueing L2 table update\n");
+                       aio_unlock(s, offset >> 9);
+                       l2_table[l2_index] = old_l2_entry;
+                       free(tmp_ptr2);
+                       return 0;
+               }
+       }
+       return cluster_offset;
+}
+
+/*
+ * Walk the image one cluster at a time, looking up the first sector of
+ * each cluster with get_cluster_offset() (no allocation requested).
+ * The walk stops early once L2_CACHE_SIZE lookups have returned a
+ * non-zero offset; presumably the point is to warm the L2 cache that
+ * get_cluster_offset() maintains -- TODO confirm against its body.
+ */
+static void init_cluster_cache(struct td_state *bs)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       uint32_t count = 0;
+       uint64_t sector;
+       int cluster_entries;
+
+       cluster_entries = s->cluster_size / 512;
+       DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
+               cluster_entries, s->cluster_size);
+
+       /*
+        * The sector index must be 64 bits wide: with an int index the
+        * byte offset "i << 9" overflows as soon as the walk reaches
+        * sector 2^22, i.e. the 2GB mark of the virtual disk.
+        */
+       for (sector = 0; sector < bs->size; sector += cluster_entries) {
+               if (get_cluster_offset(bs, sector << 9, 0, 0, 0, 1)) count++;
+               if (count >= L2_CACHE_SIZE) return;
+       }
+       DPRINTF("Finished cluster initialisation, added %d entries\n", count);
+       return;
+}
+
+/*
+ * Tell whether the cluster containing sector_num has a non-zero offset
+ * recorded for it.
+ *
+ * *pnum is set to the number of sectors, counted from sector_num and
+ * capped at nb_sectors, that remain inside that same cluster.
+ *
+ * Returns 1 when the looked-up cluster offset is non-zero, 0 otherwise.
+ */
+static int qcow_is_allocated(struct td_state *bs, int64_t sector_num, 
+                             int nb_sectors, int *pnum)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       uint64_t where = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+       int pos_in_cluster = sector_num & (s->cluster_sectors - 1);
+       int remaining = s->cluster_sectors - pos_in_cluster;
+
+       *pnum = (remaining > nb_sectors) ? nb_sectors : remaining;
+
+       return (where != 0);
+}
+
+/*
+ * Inflate buf (buf_size bytes) into out_buf using zlib with windowBits
+ * of -12 (a negative value selects a raw deflate stream without zlib
+ * header).
+ *
+ * Succeeds only when inflate() ends with Z_STREAM_END or Z_BUF_ERROR
+ * and exactly out_buf_size bytes were produced.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+       z_stream zs;
+       int status, produced;
+
+       memset(&zs, 0, sizeof(zs));
+       zs.next_in = (uint8_t *)buf;
+       zs.avail_in = buf_size;
+       zs.next_out = out_buf;
+       zs.avail_out = out_buf_size;
+
+       if (inflateInit2(&zs, -12) != Z_OK)
+               return -1;
+
+       status = inflate(&zs, Z_FINISH);
+       produced = zs.next_out - out_buf;
+       /* the stream is torn down on both the success and the error path */
+       inflateEnd(&zs);
+
+       if (status != Z_STREAM_END && status != Z_BUF_ERROR)
+               return -1;
+       if (produced != out_buf_size)
+               return -1;
+
+       return 0;
+}
+                              
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by cluster_offset.
+ *
+ * The file offset is cluster_offset masked with s->cluster_offset_mask;
+ * the compressed length is taken from the bits above
+ * (63 - s->cluster_bits).  Nothing is read when the cache already holds
+ * that file offset.
+ *
+ * Returns 0 on success, -1 if seeking, reading or inflating fails.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+       int ret, csize;
+       uint64_t coffset;
+
+       coffset = cluster_offset & s->cluster_offset_mask;
+       if (s->cluster_cache_offset != coffset) {
+               csize = cluster_offset >> (63 - s->cluster_bits);
+               csize &= (s->cluster_size - 1);
+               if (lseek(s->fd, coffset, SEEK_SET) == (off_t)-1)
+                       return -1;
+               ret = read(s->fd, s->cluster_data, csize);
+               if (ret != csize) 
+                       return -1;
+               /*
+                * The cache buffer is about to be overwritten.  Mark it
+                * invalid first so that a failed inflate cannot leave
+                * partial data behind that still looks like the
+                * previously cached cluster.
+                */
+               s->cluster_cache_offset = -1;
+               if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                                     s->cluster_data, csize) < 0) {
+                       return -1;
+               }
+               s->cluster_cache_offset = coffset;
+       }
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize qcow state.
+ *
+ * Reads and validates the qcow header from the first 512 bytes of name,
+ * loads the L1 table, allocates the L2 cache and the cluster buffers,
+ * opens the backing file if the header names one, picks up
+ * min_cluster_alloc from the Xen extended header when its magic and L1
+ * checksum match, and finally initialises the AIO state.
+ *
+ * Returns 0 on success, the negated errno if the poll pipe cannot be
+ * created, and -1 on any other failure.  On failure everything acquired
+ * here (descriptors, pipe, buffers, name copy) is released again.
+ */
+int tdqcow_open (struct td_state *bs, const char *name)
+{
+       int fd, tmp_fd, len, shift, ret, size, l1_table_size;
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       char *buf = NULL;
+       QCowHeader *header;
+       QCowHeader_ext *exthdr;
+       uint32_t cksum;
+
+       DPRINTF("QCOW: Opening %s\n",name);
+
+       /* Start from a known state so that the failure path releases
+        * exactly what has been acquired so far. */
+       s->fd = -1;
+       s->bfd = -1;
+       s->name = NULL;
+       s->l1_table = NULL;
+       s->l2_cache = NULL;
+       s->cluster_cache = NULL;
+       s->cluster_data = NULL;
+
+       /* set up a pipe so that we can hand back a poll fd that won't fire.*/
+       ret = pipe(s->poll_pipe);
+       if (ret != 0)
+               return (0 - errno);
+
+       fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE);
+       if (fd < 0) {
+               DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
+               goto fail;
+       }
+
+       s->fd = fd;
+       if (asprintf(&s->name,"%s", name) < 0) {
+               /* contents of s->name are undefined after a failure */
+               s->name = NULL;
+               goto fail;
+       }
+
+       /* both headers are parsed out of the single 512 byte block below */
+       ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);
+
+       /* aligned buffer: the image descriptor was opened with O_DIRECT */
+       ret = posix_memalign((void **)&buf, 512, 512);
+       if (ret != 0) {
+               buf = NULL;
+               goto fail;
+       }
+
+       if (read(fd, buf, 512) != 512)
+               goto fail;
+
+       header = (QCowHeader *)buf;
+       be32_to_cpus(&header->magic);
+       be32_to_cpus(&header->version);
+       be64_to_cpus(&header->backing_file_offset);
+       be32_to_cpus(&header->backing_file_size);
+       be32_to_cpus(&header->mtime);
+       be64_to_cpus(&header->size);
+       be32_to_cpus(&header->crypt_method);
+       be64_to_cpus(&header->l1_table_offset);
+   
+       if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION)
+               goto fail;
+       if (header->size <= 1 || header->cluster_bits < 9)
+               goto fail;
+       /* both values are used as shift counts on an int below */
+       if (header->cluster_bits > 30 || header->l2_bits > 30)
+               goto fail;
+       if (header->crypt_method > QCOW_CRYPT_AES)
+               goto fail;
+       s->crypt_method_header = header->crypt_method;
+       if (s->crypt_method_header)
+               s->encrypted = 1;
+       s->cluster_bits = header->cluster_bits;
+       s->cluster_size = 1 << s->cluster_bits;
+       s->cluster_sectors = 1 << (s->cluster_bits - 9);
+       s->l2_bits = header->l2_bits;
+       s->l2_size = 1 << s->l2_bits;
+       s->cluster_alloc = s->l2_size;
+       bs->size = header->size / 512;
+       s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+       
+       /* read the level 1 table */
+       shift = s->cluster_bits + s->l2_bits;
+       s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+       
+       s->l1_table_offset = header->l1_table_offset;
+
+       /*allocate a 4Kbyte multiple of memory*/
+       l1_table_size = s->l1_size * sizeof(uint64_t);
+       if (l1_table_size % 4096 > 0) {
+               l1_table_size = ((l1_table_size >> 12) + 1) << 12;
+       }
+       ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+       if (ret != 0) goto fail;
+       memset(s->l1_table, 0x00, l1_table_size);
+
+       DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
+               (unsigned long long)s->l1_table_offset,
+               (int) (s->l1_size * sizeof(uint64_t)), 
+               l1_table_size);
+
+       /* The table is kept exactly as read from the file; no byte order
+        * conversion is applied to its entries in this function. */
+       if (lseek(fd, s->l1_table_offset, SEEK_SET) == (off_t)-1)
+               goto fail;
+       if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
+               goto fail;
+
+       /* alloc L2 cache */
+       size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+       ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+       if(ret != 0) goto fail;
+
+       size = s->cluster_size;
+       ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+       if(ret != 0) goto fail;
+
+       ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+       if(ret != 0) goto fail;
+       s->cluster_cache_offset = -1;
+
+       /* read the backing file name */
+       if (header->backing_file_offset != 0) {
+               DPRINTF("Reading backing file data\n");
+               /* clamp while still unsigned: a huge on-disk value must
+                * not turn into a negative int that slips past the limit */
+               if (header->backing_file_size > 1023)
+                       len = 1023;
+               else
+                       len = header->backing_file_size;
+
+                /*TODO - Fix read size for O_DIRECT and use original fd!*/
+               /* a separate descriptor is used so that s->fd stays intact */
+               tmp_fd = open(name, O_RDONLY | O_LARGEFILE);
+               if (tmp_fd < 0)
+                       goto fail;
+               if (lseek(tmp_fd, header->backing_file_offset,
+                         SEEK_SET) == (off_t)-1 ||
+                   read(tmp_fd, bs->backing_file, len) != len) {
+                       close(tmp_fd);
+                       goto fail;
+               }
+               close(tmp_fd);
+               bs->backing_file[len] = '\0';
+
+               /*Open backing file*/
+               s->bfd = open(bs->backing_file,
+                             O_RDONLY | O_DIRECT | O_LARGEFILE);
+               if (s->bfd < 0) {
+                       DPRINTF("Unable to open backing file: %s\n",
+                               bs->backing_file);
+                       s->bfd = -1;
+                       goto fail;
+               }
+               s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+       }
+
+       bs->sector_size = 512;
+       bs->info = 0;
+       
+       /*Detect min_cluster_alloc*/
+       s->min_cluster_alloc = 1; /*Default*/
+       if (s->bfd == -1 && (s->l1_table_offset % 4096 == 0) ) {
+               /*We test to see if the xen magic # exists*/
+               exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+               be32_to_cpus(&exthdr->xmagic);
+               if(exthdr->xmagic != XEN_MAGIC) 
+                       goto end_xenhdr;
+
+               /*Finally check the L1 table cksum*/
+               be32_to_cpus(&exthdr->cksum);
+               cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
+               if(exthdr->cksum != cksum)
+                       goto end_xenhdr;
+                       
+               be32_to_cpus(&exthdr->min_cluster_alloc);
+               s->min_cluster_alloc = exthdr->min_cluster_alloc; 
+       }
+
+ end_xenhdr:
+       if (init_aio_state(bs)!=0) {
+               DPRINTF("Unable to initialise AIO state\n");
+               goto fail;
+       }
+       s->fd_end = lseek(s->fd, 0, SEEK_END);
+
+       /* header and exthdr point into buf and are not used past here */
+       free(buf);
+       return 0;
+       
+fail:
+       DPRINTF("QCOW Open failed\n");
+       free(buf);
+       free(s->name);
+       s->name = NULL;
+       free(s->l1_table);
+       s->l1_table = NULL;
+       free(s->l2_cache);
+       s->l2_cache = NULL;
+       free(s->cluster_cache);
+       s->cluster_cache = NULL;
+       free(s->cluster_data);
+       s->cluster_data = NULL;
+       if (s->bfd >= 0) {
+               close(s->bfd);
+               s->bfd = -1;
+       }
+       if (s->fd >= 0) {
+               close(s->fd);
+               s->fd = -1;
+       }
+       close(s->poll_pipe[0]);
+       close(s->poll_pipe[1]);
+       return -1;
+}
+
+/*
+ * Queue a read of nb_sectors sectors, starting at sector, into buf.
+ *
+ * The request is processed one cluster-sized piece at a time:
+ *   - no cluster offset and a backing file: async read from s->bfd;
+ *   - no cluster offset, no backing file:   buf is zero-filled;
+ *   - compressed cluster:                   copied from the cluster cache;
+ *   - otherwise:                            async read from s->fd.
+ *
+ * Returns -EBUSY when any sector of the request cannot be locked and
+ * -ENOMEM when no iocb is free or a sector lock fails.  If no async
+ * read was submitted the callback is invoked here and its result is
+ * returned; otherwise 0 is returned and the callback is left to the
+ * completion handling.
+ *
+ * NOTE(review): the -ENOMEM return inside the loop leaves pieces that
+ * were already submitted in flight -- confirm that callers cope.
+ */
+int tdqcow_queue_read(struct td_state *bs, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0;
+       uint64_t cluster_offset;
+
+       /*Check we can get a lock*/
+       for (i = 0; i < nb_sectors; i++)
+               if (!aio_can_lock(s, sector + i)) {
+                       DPRINTF("AIO_CAN_LOCK failed [%llu]\n", 
+                               (long long) sector + i);
+                       return -EBUSY;
+               }
+       
+       /*We store a local record of the request*/
+       qcow_idx = get_free_idx(s);
+       while (nb_sectors > 0) {
+               cluster_offset = 
+                       get_cluster_offset(bs, sector << 9, 0, 0, 0, 0);
+               index_in_cluster = sector & (s->cluster_sectors - 1);
+               n = s->cluster_sectors - index_in_cluster;
+               if (n > nb_sectors)
+                       n = nb_sectors;
+
+               if (s->iocb_free_count == 0 || !aio_lock(s, sector)) {
+                       DPRINTF("AIO_LOCK or iocb_free_count (%d) failed" 
+                               "[%llu]\n", s->iocb_free_count, 
+                               (long long) sector);
+                       return -ENOMEM;
+               }
+               
+               if (!cluster_offset && (s->bfd > 0)) {
+                       s->nr_reqs[qcow_idx]++;
+                       asubmit += async_read(s, s->bfd, n * 512, sector << 9, 
+                                             buf, cb, id, sector, 
+                                             qcow_idx, private);
+               } else if(!cluster_offset) {
+                       memset(buf, 0, 512 * n);
+                       aio_unlock(s, sector);
+               } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+                       /*
+                        * This piece is served synchronously, so no
+                        * completion event will ever release the sector
+                        * lock taken above: drop it here, as the
+                        * zero-fill branch does.  This also covers the
+                        * decompression failure exit.
+                        */
+                       aio_unlock(s, sector);
+                       if (decompress_cluster(s, cluster_offset) < 0) {
+                               ret = -1;
+                               goto done;
+                       }
+                       memcpy(buf, s->cluster_cache + index_in_cluster * 512, 
+                              512 * n);
+               } else {                        
+                       s->nr_reqs[qcow_idx]++;
+                       asubmit += async_read(s, s->fd, n * 512, 
+                                             (cluster_offset + 
+                                              index_in_cluster * 512), 
+                                             buf, cb, id, sector, 
+                                             qcow_idx, private);
+               }
+               nb_sectors -= n;
+               sector += n;
+               buf += n * 512;
+       }
+done:
+       /*Callback if no async requests outstanding*/
+       if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private);
+
+       return 0;
+}
+
+/*
+ * Queue a write of nb_sectors sectors, starting at sector, from buf.
+ *
+ * The request is processed one cluster-sized piece at a time.  A piece
+ * containing non-zero data has its cluster allocated on demand; an
+ * all-zero piece is written only if its cluster already exists and is
+ * otherwise skipped.  With s->crypt_method set the piece is encrypted
+ * into s->cluster_data and written from there.
+ *
+ * Returns -EBUSY when any sector of the request cannot be locked and
+ * -ENOMEM when no iocb is free or a sector lock fails.  If no async
+ * write was submitted the callback is invoked here and its result is
+ * returned; otherwise 0 is returned and the callback is left to the
+ * completion handling.
+ *
+ * NOTE(review): every encrypted piece is staged in the single
+ * s->cluster_data buffer before being handed to async_write(); if the
+ * data is not consumed before the next piece is encrypted, in-flight
+ * writes would see each other's contents -- confirm against
+ * async_write().
+ */
+int tdqcow_queue_write(struct td_state *bs, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0;
+       int allocate;
+       char *src;
+       uint64_t cluster_offset;
+
+       /*Check we can get a lock*/
+       for (i = 0; i < nb_sectors; i++)
+               if (!aio_can_lock(s, sector + i))  {
+                       DPRINTF("AIO_CAN_LOCK failed [%llu]\n", 
+                               (long long) (sector + i));
+                       return -EBUSY;
+               }
+                  
+       /*We store a local record of the request*/
+       qcow_idx = get_free_idx(s);     
+       while (nb_sectors > 0) {
+               index_in_cluster = sector & (s->cluster_sectors - 1);
+               n = s->cluster_sectors - index_in_cluster;
+               if (n > nb_sectors)
+                       n = nb_sectors;
+
+               if (s->iocb_free_count == 0 || !aio_lock(s, sector)){
+                       DPRINTF("AIO_LOCK or iocb_free_count (%d) failed" 
+                               "[%llu]\n", s->iocb_free_count, 
+                               (long long) sector);
+                       return -ENOMEM;
+               }
+
+               /* Only non-zero data forces a cluster to be allocated;
+                * for zero data we merely look for an existing one. */
+               allocate = IS_ZERO(buf,n * 512) ? 0 : 1;
+               cluster_offset = get_cluster_offset(bs, sector << 9, 
+                                                   allocate, 0, 
+                                                   index_in_cluster, 
+                                                   index_in_cluster+n);
+
+               if (!cluster_offset) {
+                       /* Nothing is submitted for this piece, so no
+                        * completion will release the sector lock. */
+                       aio_unlock(s, sector);
+                       if (allocate) {
+                               DPRINTF("Ooops, no write cluster offset!\n");
+                               ret = -1;
+                               goto done;
+                       }
+                       /* zero data, cluster not allocated: nothing to do */
+               } else {
+                       if (s->crypt_method) {
+                               encrypt_sectors(s, sector, s->cluster_data, 
+                                               (unsigned char *)buf, n, 1,
+                                               &s->aes_encrypt_key);
+                               src = (char *)s->cluster_data;
+                       } else {
+                               src = buf;
+                       }
+                       s->nr_reqs[qcow_idx]++;
+                       asubmit += async_write(s, s->fd, n * 512, 
+                                              (cluster_offset + 
+                                               index_in_cluster*512), 
+                                              src, cb, id, sector, 
+                                              qcow_idx, private);
+               }
+               nb_sectors -= n;
+               sector += n;
+               buf += n * 512;
+       }
+       s->cluster_cache_offset = -1; /* disable compressed cache */
+
+done:
+       /*Callback if no async requests outstanding*/
+       if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private);
+
+       return 0;
+}
+               
+/*
+ * Hand every iocb queued so far to io_submit() and reset the queue
+ * counter.  Returns whatever io_submit() returned.
+ */
+int tdqcow_submit(struct td_state *bs)
+{
+        struct tdqcow_state *state = (struct tdqcow_state *)bs->private;
+        int submitted;
+
+        submitted = io_submit(state->aio_ctx, state->iocb_queued,
+                              state->iocb_queue);
+
+        /* XXX: TODO: Handle error conditions here. */
+
+        /* The queue is emptied regardless of the io_submit() result. */
+        state->iocb_queued = 0;
+
+        return submitted;
+}
+
+
+/*
+ * Return a newly allocated array of MAX_IOFD descriptors for the caller
+ * to poll on.  Slot 0 holds s->poll_fd, every other slot is 0.
+ *
+ * Returns NULL if the array cannot be allocated.  The array is not
+ * freed in this file; presumably ownership passes to the caller.
+ */
+int *tdqcow_get_fd(struct td_state *bs)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       int *fds;
+
+       /* calloc zero-fills the array and guards the size multiplication */
+       fds = calloc(MAX_IOFD, sizeof(int));
+       if (fds == NULL)
+               return NULL;
+
+       fds[0] = s->poll_fd;
+       return fds;
+}
+
+/*
+ * Release the qcow state set up by tdqcow_open().
+ *
+ * When min_cluster_alloc equals l2_size, the checksum of the in-memory
+ * L1 table is first written back into the extended header, at
+ * sizeof(QCowHeader) + sizeof(uint32_t) from the start of the file.
+ * A failure to store the checksum is logged but does not change the
+ * return value.
+ *
+ * Always returns 0.
+ */
+int tdqcow_close(struct td_state *bs)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       uint32_t cksum, out;
+       int fd, offset;
+
+       /*Update the hdr cksum*/
+       if(s->min_cluster_alloc == s->l2_size) {
+               cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
+               printf("Writing cksum: %d",cksum);
+               fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
+               if (fd < 0) {
+                       DPRINTF("Unable to reopen %s to store cksum\n",
+                               s->name);
+               } else {
+                       offset = sizeof(QCowHeader) + sizeof(uint32_t);
+                       out = cpu_to_be32(cksum);
+                       if (lseek(fd, offset, SEEK_SET) == (off_t)-1 ||
+                           write(fd, &out, sizeof(uint32_t)) !=
+                           (ssize_t)sizeof(uint32_t))
+                               DPRINTF("Unable to store cksum in %s\n",
+                                       s->name);
+                       close(fd);
+               }
+       }
+
+       free(s->name);
+       free(s->l1_table);
+       free(s->l2_cache);
+       free(s->cluster_cache);
+       free(s->cluster_data);
+       /* the backing file descriptor is opened in tdqcow_open() and
+        * is not released anywhere else in the visible code */
+       if (s->bfd > 0)
+               close(s->bfd);
+       close(s->fd);   
+       return 0;
+}
+
+/*
+ * Reap completed AIO events without blocking and run the completion
+ * handling for each of them.
+ *
+ * Returns 1 straight away when sid is greater than MAX_IOFD; otherwise
+ * returns the sum of the values returned by the request callbacks that
+ * were invoked.
+ */
+int tdqcow_do_callbacks(struct td_state *s, int sid)
+{
+        int ret, i, rsp = 0,*ptr;
+        struct io_event *ep;
+        struct tdqcow_state *prv = (struct tdqcow_state *)s->private;
+
+        if (sid > MAX_IOFD) return 1;
+       
+       /* Non-blocking test for completed io. */
+        ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
+                           NULL);
+
+        /* a zero or negative ret means the loop body never runs */
+        for (ep=prv->aio_events, i = ret; i-->0; ep++) {
+                struct iocb        *io  = ep->obj;
+                struct pending_aio *pio;
+
+                /* io->data carries the index of the request's
+                 * pending_aio record */
+                pio = &prv->pending_aio[(long)io->data];
+
+                if (ep->res != io->u.c.nbytes) {
+                        /* TODO: handle this case better. */
+                       ptr = (int *)&ep->res;
+                        DPRINTF("AIO did less than I asked it to "
+                               "[%lu,%lu,%d]\n", 
+                               ep->res, io->u.c.nbytes, *ptr);
+                }
+               aio_unlock(prv, pio->sector);
+               if (pio->id >= 0) {
+                       /* NOTE(review): this runs for every completion
+                        * with id >= 0 when crypt_method is set; nothing
+                        * here tells a read from a write -- confirm this
+                        * is intended for write completions too. */
+                       if (prv->crypt_method)
+                               encrypt_sectors(prv, pio->sector, 
+                                               (unsigned char *)pio->buf, 
+                                               (unsigned char *)pio->buf, 
+                                               pio->nb_sectors, 0, 
+                                               &prv->aes_decrypt_key);
+                       /* the callback fires once per request slot, when
+                        * its last outstanding piece has completed */
+                       prv->nr_reqs[pio->qcow_idx]--;
+                       if (prv->nr_reqs[pio->qcow_idx] == 0) 
+                               rsp += pio->cb(s, ep->res2, pio->id, 
+                                              pio->private);
+               /* id -2 is what get_cluster_offset() passes for its L2
+                * table write, whose buffer it allocated itself */
+               } else if (pio->id == -2) free(pio->buf);
+
+                /* hand the iocb back to the free list */
+                prv->iocb_free[prv->iocb_free_count++] = io;
+        }
+        return rsp;
+}
+
+/*
+ * Create a new qcow image file at filename.
+ *
+ * total_size is the virtual disk size in bytes.  When backing_file
+ * names a local path, the path is resolved, recorded in the header and
+ * the image size is taken from the backing file instead.  A URL-like
+ * backing_file (a ':' at index 2 or later) is recorded verbatim and
+ * total_size is kept, since the file cannot be stat()ed.  The literal
+ * "fat:" is treated as no backing file.  A non-zero flags value selects
+ * AES encryption.
+ *
+ * The file receives the qcow header, the Xen extended header, the
+ * optional backing file name and a zeroed L1 table, and is padded to a
+ * 4 KByte boundary.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+                      const char *backing_file, int flags)
+{
+       int fd, header_size, backing_filename_len, l1_size;
+       int shift, length;
+       QCowHeader header;
+       QCowHeader_ext exthdr;
+       char backing_filename[1024], *ptr = NULL, *resolved;
+       uint64_t size;
+       size_t l1_bytes;
+       struct stat st;
+
+       DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size);
+
+       fd = open(filename, 
+                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 
+                 0644);
+       if (fd < 0)
+               return -1;
+
+       memset(&header, 0, sizeof(header));
+       /* exthdr is written to disk as a whole, st is read even on the
+        * paths that never call stat(): neither may hold stack garbage */
+       memset(&exthdr, 0, sizeof(exthdr));
+       memset(&st, 0, sizeof(st));
+       header.magic = cpu_to_be32(QCOW_MAGIC);
+       header.version = cpu_to_be32(QCOW_VERSION);
+
+       /*Create extended header fields*/
+       exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+       header_size = sizeof(header) + sizeof(QCowHeader_ext);
+       backing_filename_len = 0;
+       size = (total_size >> SECTOR_SHIFT);
+       if (backing_file) {
+               if (strcmp(backing_file, "fat:")) {
+                       const char *p;
+                       /* XXX: this is a hack: we do not attempt to 
+                        *check for URL like syntax */
+                       p = strchr(backing_file, ':');
+                       if (p && (p - backing_file) >= 2) {
+                               /* URL like but exclude "c:" like filenames */
+                               if (strlen(backing_file) >= 
+                                   sizeof(backing_filename))
+                                       goto fail;
+                               strcpy(backing_filename, backing_file);
+                       } else {
+                               /* let realpath() size its own buffer: the
+                                * result may be longer than our array */
+                               resolved = realpath(backing_file, NULL);
+                               if (resolved == NULL ||
+                                   strlen(resolved) >= 
+                                   sizeof(backing_filename) ||
+                                   stat(resolved, &st) != 0) {
+                                       free(resolved);
+                                       goto fail;
+                               }
+                               strcpy(backing_filename, resolved);
+                               free(resolved);
+
+                               /*Set to the backing file size*/
+                               size = (st.st_size >> SECTOR_SHIFT);
+                       }
+                       header.backing_file_offset = cpu_to_be64(header_size);
+                       backing_filename_len = strlen(backing_filename);
+                       header.backing_file_size = cpu_to_be32(
+                               backing_filename_len);
+                       header_size += backing_filename_len;
+                       
+                       DPRINTF("Backing file size detected: %lld sectors" 
+                               "(total %lld [%lld MB])\n", 
+                               (long long)size, 
+                               (long long)(size << SECTOR_SHIFT), 
+                               (long long)(size >> 11));
+               } else {
+                       backing_file = NULL;
+                       DPRINTF("Setting file size: %lld (total %lld)\n", 
+                               (long long) total_size, 
+                               (long long) (total_size << SECTOR_SHIFT));
+               }
+               /* st is all zero unless stat() succeeded above */
+               header.mtime = cpu_to_be32(st.st_mtime);
+               header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                           unmodified sectors */
+               header.l2_bits = 12; /* 32 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1);
+       } else {
+               DPRINTF("Setting file size: %lld sectors" 
+                       "(total %lld [%lld MB])\n", 
+                       (long long) size, 
+                       (long long) (size << SECTOR_SHIFT), 
+                       (long long) (size >> 11));
+               header.cluster_bits = 12; /* 4 KB clusters */
+               header.l2_bits = 9; /* 4 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+       }
+       /*Set the header size value*/
+       header.size = cpu_to_be64(size * 512);
+       
+       header_size = (header_size + 7) & ~7;
+       if (header_size % 4096 > 0) {
+               header_size = ((header_size >> 12) + 1) << 12;
+       }
+
+       shift = header.cluster_bits + header.l2_bits;
+       l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+       header.l1_table_offset = cpu_to_be64(header_size);
+       DPRINTF("L1 Table offset: %d, size %d\n",
+               header_size,
+               (int)(l1_size * sizeof(uint64_t)));
+       if (flags) {
+               header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+       } else {
+               header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+       }
+
+       /* One zeroed buffer serves both as checksum input and as the
+        * initial L1 table content written below. */
+       l1_bytes = l1_size * sizeof(uint64_t);
+       ptr = calloc(1, l1_bytes ? l1_bytes : sizeof(uint64_t));
+       if (ptr == NULL)
+               goto fail;
+       exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_bytes));
+       printf("Created cksum: %d\n",exthdr.cksum);
+       
+       /* write all the data */
+       if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header) ||
+           write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+               goto fail;
+       if (backing_file &&
+           write(fd, backing_filename, backing_filename_len) != 
+           backing_filename_len)
+               goto fail;
+       if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+               goto fail;
+       if (l1_bytes > 0 && write(fd, ptr, l1_bytes) != (ssize_t)l1_bytes)
+               goto fail;
+
+       /*adjust file length to 4 KByte boundary*/
+       length = header_size + l1_size * sizeof(uint64_t);
+       if (length % 4096 > 0) {
+               length = ((length >> 12) + 1) << 12;
+               if (ftruncate(fd, length) != 0)
+                       goto fail;
+               DPRINTF("Adjusted filelength to %d for 4 "
+                       "Kbyte alignment\n",length);
+       }
+
+       free(ptr);
+       if (close(fd) != 0)
+               return -1;
+
+       return 0;
+
+fail:
+       free(ptr);
+       close(fd);
+       return -1;
+}
+
+/*
+ * Reset the image to an empty state: zero the L1 table in memory and
+ * in the file, cut the file off right behind the L1 table and clear
+ * the L2 cache together with its bookkeeping arrays.
+ *
+ * Returns 0 on success, -1 if writing the L1 table reports an error.
+ */
+int qcow_make_empty(struct td_state *bs)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       uint32_t l1_bytes = s->l1_size * sizeof(uint64_t);
+       int failed;
+
+       memset(s->l1_table, 0, l1_bytes);
+
+       lseek(s->fd, s->l1_table_offset, SEEK_SET);
+       failed = (write(s->fd, s->l1_table, l1_bytes) < 0);
+       if (failed)
+               return -1;
+       ftruncate(s->fd, s->l1_table_offset + l1_bytes);
+
+       /* every cached L2 entry now refers to truncated data */
+       memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+       memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+       memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+       return 0;
+}
+
+/* Accessor: the cluster size recorded in the qcow state of bs. */
+int qcow_get_cluster_size(struct td_state *bs)
+{
+       return ((struct tdqcow_state *)bs->private)->cluster_size;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Deflate one cluster worth of data from buf and, if the result is
+ * smaller than a cluster, store it as the compressed cluster for
+ * sector_num.
+ *
+ * Returns 0 on success and -1 if allocation, deflate, the cluster
+ * lookup, the seek or the write fails.
+ *
+ * NOTE(review): when the data does not compress, nothing is written at
+ * all (the fallback to a normal write is commented out) and 0 is still
+ * returned -- confirm that callers handle this case themselves.
+ */
+int qcow_compress_cluster(struct td_state *bs, int64_t sector_num, 
+                          const uint8_t *buf)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
+       z_stream strm;
+       int ret, out_len, rc = -1;
+       uint8_t *out_buf;
+       uint64_t cluster_offset;
+
+       out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+       if (!out_buf)
+               return -1;
+
+       /* default compression level, small window, no zlib header */
+       memset(&strm, 0, sizeof(strm));
+       ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                          Z_DEFLATED, -12, 
+                          9, Z_DEFAULT_STRATEGY);
+       if (ret != 0)
+               goto out;
+
+       strm.avail_in = s->cluster_size;
+       strm.next_in = (uint8_t *)buf;
+       strm.avail_out = s->cluster_size;
+       strm.next_out = out_buf;
+
+       ret = deflate(&strm, Z_FINISH);
+       if (ret != Z_STREAM_END && ret != Z_OK) {
+               deflateEnd(&strm);
+               goto out;
+       }
+       out_len = strm.next_out - out_buf;
+
+       deflateEnd(&strm);
+
+       if (ret == Z_STREAM_END && out_len < s->cluster_size) {
+               cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, 
+                                            out_len, 0, 0);
+               /* A zero offset means no cluster could be set up; going
+                * on would write the data at file offset 0, on top of
+                * the image header. */
+               if (cluster_offset == 0)
+                       goto out;
+               cluster_offset &= s->cluster_offset_mask;
+               if (lseek(s->fd, cluster_offset, SEEK_SET) == (off_t)-1)
+                       goto out;
+               if (write(s->fd, out_buf, out_len) != out_len)
+                       goto out;
+       }
+       /* else: could not compress, see the NOTE(review) above */
+
+       rc = 0;
+out:
+       free(out_buf);
+       return rc;
+}
+
+/*
+ * Driver descriptor for the qcow image format.  The initializer is
+ * positional, so the order of the entries has to follow the field
+ * order of struct tap_disk (declared outside this file).
+ */
+struct tap_disk tapdisk_qcow = {
+       "tapdisk_qcow",                 /* driver name string */
+       sizeof(struct tdqcow_state),    /* size of the state reached via bs->private */
+       tdqcow_open,                    /* open image and set up state */
+       tdqcow_queue_read,              /* queue a read request */
+       tdqcow_queue_write,             /* queue a write request */
+       tdqcow_submit,                  /* io_submit() the queued iocbs */
+       tdqcow_get_fd,                  /* descriptor array to poll on */
+       tdqcow_close,                   /* release state */
+       tdqcow_do_callbacks,            /* reap completions, run callbacks */
+};
+
diff --git a/tools/blktap/drivers/block-ram.c b/tools/blktap/drivers/block-ram.c
new file mode 100644 (file)
index 0000000..4c378ed
--- /dev/null
@@ -0,0 +1,296 @@
+/* block-ram.c
+ *
+ * Fast Ramdisk implementation.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include "tapdisk.h"
+
+#define MAX_DISK_SIZE 1024000 /* disk limit in sectors: 500MB, assuming 512-byte sectors */
+
+/*
+ * The ramdisk contents and geometry live in file-scope variables, so every
+ * connection opened through this driver refers to the same image.
+ */
+char *img;                     /* in-memory copy of the image, allocated by tdram_open() */
+long int   disksector_size;    /* copy of td_state.sector_size saved by get_image_info() */
+long int   disksize;           /* copy of td_state.size saved by get_image_info() */
+long int   diskinfo;           /* copy of td_state.info saved by get_image_info() */
+static int connections = 0;    /* incremented by tdram_open(), decremented by tdram_close() */
+
+/* Per-connection private state of the ramdisk driver. */
+struct tdram_state {
+        int fd;           /* image file descriptor; tdram_open() stores -1 when it reuses a loaded image */
+       int poll_pipe[2]; /* dummy fd for polling on */
+};
+
+/*Get Image size, secsize*/
+static int get_image_info(struct td_state *s, int fd)
+{
+       int ret;
+       long size;
+       unsigned long total_size;
+       struct statvfs statBuf;
+       struct stat stat;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               s->size = 0;
+               if (ioctl(fd,BLKGETSIZE,&s->size)!=0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int arg;
+                       s->sector_size = DEFAULT_SECTOR_SIZE;
+                       ioctl(fd, BLKSSZGET, &s->sector_size);
+                       
+                       if (s->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       s->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               s->size = (stat.st_size >> SECTOR_SHIFT);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+       }
+
+       if (s->size == 0) {             
+               s->size =((uint64_t) MAX_DISK_SIZE);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       s->info = 0;
+
+        /*Store variables locally*/
+       disksector_size = s->sector_size;
+       disksize        = s->size;
+       diskinfo        = s->info;
+       DPRINTF("Image sector_size: \n\t[%lu]\n",
+               s->sector_size);
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize ram state.
+ *
+ * The first connection (or any connection that finds no loaded image) reads
+ * the whole file into the shared img buffer.  Later connections only copy the
+ * saved geometry into s and record fd == -1.
+ *
+ * Returns 0 on success or a negative value on failure.  On failure the image
+ * buffer is released and every descriptor opened here is closed and recorded
+ * as -1 in the private state, so nothing leaks and nothing is closed twice.
+ *
+ * NOTE(review): the image is opened with O_DIRECT but read into a plain
+ * malloc() buffer; confirm that the alignment is acceptable on the target
+ * filesystems.
+ */
+int tdram_open (struct td_state *s, const char *name)
+{
+       int fd, ret, count = 0;
+       uint64_t i;
+       struct tdram_state *prv = (struct tdram_state *)s->private;
+       char *p;
+
+       connections++;
+
+       /* set up a pipe so that we can hand back a poll fd that won't fire.*/
+       ret = pipe(prv->poll_pipe);
+       if (ret != 0)
+               return (0 - errno);
+       prv->fd = -1;
+
+       /* Only share the image if an earlier open really loaded one. */
+       if (connections > 1 && img != NULL) {
+               s->sector_size = disksector_size;
+               s->size        = disksize;
+               s->info        = diskinfo;
+               DPRINTF("Image already open, returning parameters:\n");
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+               DPRINTF("Image sector_size: \n\t[%lu]\n",
+                       s->sector_size);
+               return 0;
+       }
+
+       /* Open the file */
+       fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+
+               /* Maybe O_DIRECT isn't supported. */
+               fd = open(name, O_RDWR | O_LARGEFILE);
+               if (fd != -1) DPRINTF("WARNING: Accessing image without"
+                                     "O_DIRECT! (%s)\n", name);
+
+       } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+       if (fd == -1) {
+               /* Capture errno before logging has a chance to change it. */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+
+       prv->fd = fd;
+
+       ret = get_image_info(s, fd);
+       if (ret != 0)
+               goto fail;
+
+       if (s->size > (uint64_t)MAX_DISK_SIZE) {
+               DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+                       (MAX_DISK_SIZE<<SECTOR_SHIFT)>>20);
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /*Read the image into memory*/
+       free(img);      /* drop any image left over from a previous session */
+       p = img = malloc(s->size << SECTOR_SHIFT);
+       if (img == NULL) {
+               DPRINTF("Mem malloc failed\n");
+               ret = -1;
+               goto fail;
+       }
+       DPRINTF("Reading %llu bytes.......",(long long unsigned)s->size << SECTOR_SHIFT);
+
+       for (i = 0; i < s->size; i++) {
+               ret = read(prv->fd, p, s->sector_size);
+               if (ret != s->sector_size)
+                       break;
+               count += ret;
+               p = img + count;
+       }
+       DPRINTF("[%d]\n",count);
+       if (count != s->size << SECTOR_SHIFT) {
+               ret = -1;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       free(img);
+       img = NULL;
+       if (prv->fd != -1) {
+               close(prv->fd);
+               prv->fd = -1;
+       }
+       close(prv->poll_pipe[0]);
+       close(prv->poll_pipe[1]);
+       prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+       return ret;
+}
+
+/*
+ * Copy nb_sectors sectors starting at sector out of the in-memory image into
+ * buf and complete the request immediately through cb.
+ *
+ * Returns the number of bytes copied.  A request that does not lie entirely
+ * inside the loaded image is rejected with -EINVAL, which is both passed to
+ * cb and returned, instead of reading past the end of the buffer.
+ */
+int tdram_queue_read(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       /* Number of bytes tdram_open() allocates for img. */
+       uint64_t limit   = (uint64_t)s->size << SECTOR_SHIFT;
+       uint64_t offset  = sector * (uint64_t)s->sector_size;
+       int      size    = nb_sectors * s->sector_size;
+       int ret;
+
+       if (img == NULL || nb_sectors < 0 || size < 0 ||
+           sector > limit ||   /* also keeps the offset multiply from wrapping */
+           offset > limit || (uint64_t)size > limit - offset) {
+               ret = -EINVAL;
+       } else {
+               memcpy(buf, img + offset, size);
+               ret = size;
+       }
+
+       cb(s, (ret < 0) ? ret: 0, id, private);
+
+       return ret;
+}
+
+/*
+ * Copy nb_sectors sectors from buf into the in-memory image at sector and
+ * complete the request immediately through cb.
+ *
+ * Returns the number of bytes copied.  A request that does not lie entirely
+ * inside the loaded image is rejected with -EINVAL, which is both passed to
+ * cb and returned, instead of writing past the end of the buffer.
+ */
+int tdram_queue_write(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       /* Number of bytes tdram_open() allocates for img. */
+       uint64_t limit   = (uint64_t)s->size << SECTOR_SHIFT;
+       uint64_t offset  = sector * (uint64_t)s->sector_size;
+       int      size    = nb_sectors * s->sector_size;
+       int ret;
+
+       if (img == NULL || nb_sectors < 0 || size < 0 ||
+           sector > limit ||   /* also keeps the offset multiply from wrapping */
+           offset > limit || (uint64_t)size > limit - offset) {
+               ret = -EINVAL;
+       } else {
+               /*We assume that write access is controlled at a higher level for multiple disks*/
+               memcpy(img + offset, buf, size);
+               ret = size;
+       }
+
+       cb(s, (ret < 0) ? ret : 0, id, private);
+
+       return ret;
+}
+               
+/*
+ * Submit hook of the ramdisk driver.  The queue functions of this driver
+ * complete each request before they return, so there is nothing to do here.
+ * Always returns 0.
+ */
+int tdram_submit(struct td_state *s)
+{
+       (void)s;        /* no per-disk state is consulted */
+
+       return 0;
+}
+
+
+/*
+ * Build the descriptor list handed back for polling.
+ *
+ * Returns a heap-allocated array of MAX_IOFD ints whose first entry is the
+ * read end of the dummy poll pipe and whose remaining entries are 0, or NULL
+ * if the allocation fails.  The array is not retained here; the caller owns
+ * it.
+ */
+int *tdram_get_fd(struct td_state *s)
+{
+       struct tdram_state *prv = (struct tdram_state *)s->private;
+       int *fds;
+
+       /* calloc() gives the zero-filled array the original loop built. */
+       fds = calloc(MAX_IOFD, sizeof(int));
+       if (fds == NULL)
+               return NULL;
+
+       fds[0] = prv->poll_pipe[0];
+       return fds;
+}
+
+/*
+ * Tear down one connection to the ramdisk.
+ *
+ * Closes the descriptors tdram_open() created for this connection (the image
+ * file, if this connection opened it, and both ends of the poll pipe) and
+ * releases the shared image once the last connection is gone.
+ *
+ * Always returns 0.
+ */
+int tdram_close(struct td_state *s)
+{
+       struct tdram_state *prv = (struct tdram_state *)s->private;
+
+       /* Connections that reused a loaded image carry fd == -1. */
+       if (prv->fd != -1) {
+               close(prv->fd);
+               prv->fd = -1;
+       }
+       close(prv->poll_pipe[0]);
+       close(prv->poll_pipe[1]);
+       prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+
+       connections--;
+
+       /* The next first open reloads the image from the file, so the old
+        * buffer would otherwise be leaked. */
+       if (connections == 0) {
+               free(img);
+               img = NULL;
+       }
+
+       return 0;
+}
+
+/* Callback hook of the ramdisk driver: unconditionally requests a kick. */
+int tdram_do_callbacks(struct td_state *s, int sid)
+{
+       const int want_kick = 1;        /* always ask for a kick */
+
+       (void)s;
+       (void)sid;
+
+       return want_kick;
+}
+
+/*
+ * Driver descriptor for the ramdisk backend.  The initializer is positional,
+ * so the entries must stay in the order in which struct tap_disk declares its
+ * members (declaration not visible here).  The values supplied are, in order:
+ * the driver name, the size of the per-disk private state, and the open,
+ * queue-read, queue-write, submit, get-fd, close and do-callbacks handlers.
+ */
+struct tap_disk tapdisk_ram = {
+       "tapdisk_ram",
+       sizeof(struct tdram_state),
+       tdram_open,
+       tdram_queue_read,
+       tdram_queue_write,
+       tdram_submit,
+       tdram_get_fd,
+       tdram_close,
+       tdram_do_callbacks,
+};
+
diff --git a/tools/blktap/drivers/block-sync.c b/tools/blktap/drivers/block-sync.c
new file mode 100644 (file)
index 0000000..77865cc
--- /dev/null
@@ -0,0 +1,242 @@
+/* block-sync.c
+ *
+ * simple slow synchronous raw disk implementation.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include "tapdisk.h"
+
+/* Per-disk private state of the synchronous raw driver. */
+struct tdsync_state {
+       int fd;           /* image file descriptor, set by tdsync_open() */
+       int poll_pipe[2]; /* dummy fd for polling on */
+};
+       
+/*Get Image size, secsize*/
+static int get_image_info(struct td_state *s, int fd)
+{
+       int ret;
+       long size;
+       unsigned long total_size;
+       struct statvfs statBuf;
+       struct stat stat;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               s->size = 0;
+               if (ioctl(fd,BLKGETSIZE,&s->size)!=0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int arg;
+                       s->sector_size = DEFAULT_SECTOR_SIZE;
+                       ioctl(fd, BLKSSZGET, &s->sector_size);
+                       
+                       if (s->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       s->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               s->size = (stat.st_size >> SECTOR_SHIFT);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%lluu]\n\tpost "
+                       "sector_shift [%lluu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+       }
+
+       if (s->size == 0)
+               return -EINVAL;
+
+       s->info = 0;
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize the synchronous driver state.
+ *
+ * Creates the dummy poll pipe, opens the image (with O_DIRECT when the
+ * filesystem accepts it) and fills in the geometry through get_image_info().
+ *
+ * Returns 0 on success or a negative value on failure.  On failure every
+ * descriptor opened here is closed and recorded as -1 in the private state,
+ * so nothing leaks and nothing is closed twice.
+ */
+int tdsync_open (struct td_state *s, const char *name)
+{
+       int fd, ret;
+       struct tdsync_state *prv = (struct tdsync_state *)s->private;
+
+       prv->fd = -1;
+
+       /* set up a pipe so that we can hand back a poll fd that won't fire.*/
+       ret = pipe(prv->poll_pipe);
+       if (ret != 0) {
+               ret = 0 - errno;
+               prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+               return ret;
+       }
+
+       /* Open the file */
+       fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE);
+
+       if ( (fd == -1) && (errno == EINVAL) ) {
+
+               /* Maybe O_DIRECT isn't supported. */
+               fd = open(name, O_RDWR | O_LARGEFILE);
+               if (fd != -1) DPRINTF("WARNING: Accessing image without"
+                                     "O_DIRECT! (%s)\n", name);
+
+       } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+       if (fd == -1) {
+               /* Capture errno before logging has a chance to change it. */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+
+       prv->fd = fd;
+
+       ret = get_image_info(s, fd);
+       if (ret != 0)
+               goto fail;
+
+       return 0;
+
+fail:
+       if (prv->fd != -1) {
+               close(prv->fd);
+               prv->fd = -1;
+       }
+       close(prv->poll_pipe[0]);
+       close(prv->poll_pipe[1]);
+       prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+       return ret;
+}
+
+/*
+ * Read nb_sectors sectors starting at sector into buf, synchronously, and
+ * report the outcome through cb: 0 on success, a negative errno value on
+ * failure.  A transfer that moves fewer bytes than requested is reported as
+ * -EIO, because errno carries no information in that case.
+ *
+ * Always returns 1.
+ */
+int tdsync_queue_read(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdsync_state *prv = (struct tdsync_state *)s->private;
+       int      size    = nb_sectors * s->sector_size;
+       uint64_t offset  = sector * (uint64_t)s->sector_size;
+       ssize_t  done;
+       int ret = 0;
+
+       /* Compare as off_t so a large offset is never squeezed into an int. */
+       if (lseek(prv->fd, offset, SEEK_SET) == (off_t)-1) {
+               ret = 0 - errno;
+       } else {
+               done = read(prv->fd, buf, size);
+               if (done < 0)
+                       ret = 0 - errno;
+               else if (done != size)
+                       ret = -EIO;
+       }
+
+       cb(s, ret, id, private);
+
+       return 1;
+}
+
+/*
+ * Write nb_sectors sectors from buf to the image at sector, synchronously,
+ * and report the outcome through cb: 0 on success, a negative errno value on
+ * failure.  A transfer that moves fewer bytes than requested is reported as
+ * -EIO, because errno carries no information in that case.
+ *
+ * Always returns 1.
+ */
+int tdsync_queue_write(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdsync_state *prv = (struct tdsync_state *)s->private;
+       int      size    = nb_sectors * s->sector_size;
+       uint64_t offset  = sector * (uint64_t)s->sector_size;
+       ssize_t  done;
+       int ret = 0;
+
+       /* Compare as off_t so a large offset is never squeezed into an int. */
+       if (lseek(prv->fd, offset, SEEK_SET) == (off_t)-1) {
+               ret = 0 - errno;
+       } else {
+               done = write(prv->fd, buf, size);
+               if (done < 0)
+                       ret = 0 - errno;
+               else if (done != size)
+                       ret = -EIO;
+       }
+
+       cb(s, ret, id, private);
+
+       return 1;
+}
+               
+/*
+ * Submit hook of the synchronous driver.  The queue functions of this driver
+ * complete each request before they return, so there is nothing to do here.
+ * Always returns 0.
+ */
+int tdsync_submit(struct td_state *s)
+{
+       (void)s;        /* no per-disk state is consulted */
+
+       return 0;
+}
+
+
+/*
+ * Build the descriptor list handed back for polling.
+ *
+ * Returns a heap-allocated array of MAX_IOFD ints whose first entry is the
+ * read end of the dummy poll pipe and whose remaining entries are 0, or NULL
+ * if the allocation fails.  The array is not retained here; the caller owns
+ * it.
+ */
+int *tdsync_get_fd(struct td_state *s)
+{
+       struct tdsync_state *prv = (struct tdsync_state *)s->private;
+       int *fds;
+
+       /* calloc() gives the zero-filled array the original loop built. */
+       fds = calloc(MAX_IOFD, sizeof(int));
+       if (fds == NULL)
+               return NULL;
+
+       fds[0] = prv->poll_pipe[0];
+       return fds;
+}
+
+/*
+ * Close the image descriptor and both ends of the poll pipe, in that order.
+ * Always returns 0.
+ */
+int tdsync_close(struct td_state *s)
+{
+       struct tdsync_state *state = (struct tdsync_state *)s->private;
+       const int to_close[] = {
+               state->fd,
+               state->poll_pipe[0],
+               state->poll_pipe[1],
+       };
+       unsigned int k;
+
+       for (k = 0; k < sizeof(to_close) / sizeof(to_close[0]); k++)
+               close(to_close[k]);
+
+       return 0;
+}
+
+/* Callback hook of the synchronous driver: unconditionally requests a kick. */
+int tdsync_do_callbacks(struct td_state *s, int sid)
+{
+       const int want_kick = 1;        /* always ask for a kick */
+
+       (void)s;
+       (void)sid;
+
+       return want_kick;
+}
+
+/*
+ * Driver descriptor for the synchronous raw backend.  The initializer is
+ * positional, so the entries must stay in the order in which struct tap_disk
+ * declares its members (declaration not visible here).  The values supplied
+ * are, in order: the driver name, the size of the per-disk private state, and
+ * the open, queue-read, queue-write, submit, get-fd, close and do-callbacks
+ * handlers.
+ */
+struct tap_disk tapdisk_sync = {
+       "tapdisk_sync",
+       sizeof(struct tdsync_state),
+       tdsync_open,
+       tdsync_queue_read,
+       tdsync_queue_write,
+       tdsync_submit,
+       tdsync_get_fd,
+       tdsync_close,
+       tdsync_do_callbacks,
+};
+
diff --git a/tools/blktap/drivers/block-vmdk.c b/tools/blktap/drivers/block-vmdk.c
new file mode 100644 (file)
index 0000000..437cd5c
--- /dev/null
@@ -0,0 +1,415 @@
+/* block-vmdk.c
+ *
+ * VMware Disk format implementation.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This is largely the same as the vmdk driver in Qemu, I've just twisted it
+ * to match our interfaces.  The original (BSDish) Copyright message appears 
+ * below:
+ */
+/*
+ * Block driver for the VMDK format
+ * 
+ * Copyright (c) 2004 Fabrice Bellard
+ * Copyright (c) 2005 Filip Navara
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include "tapdisk.h"
+#include "bswap.h"
+
+#define safer_free(_x)       \
+  do {                       \
+       if (NULL != _x) {    \
+               free(_x);    \
+               (_x) = NULL; \
+       }                    \
+  } while (0) ;
+
+/*
+ * File signatures.  tdvmdk_open() reads the first four bytes of the image,
+ * passes them through be32_to_cpu() and compares the result with these
+ * values, so the images start with the bytes "COWD" and "KDMV" respectively.
+ */
+#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
+#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
+
+/*
+ * Header of a VMDK3 image as tdvmdk_open() reads it, directly after the
+ * 4-byte magic.  tdvmdk_open() converts the fields it uses with
+ * le32_to_cpu().
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    uint32_t disk_sectors;        /* stored in td_state.size */
+    uint32_t granularity;         /* stored in tdvmdk_state.cluster_sectors */
+    uint32_t l1dir_offset;        /* shifted left by 9 to give l1_table_offset */
+    uint32_t l1dir_size;
+    uint32_t file_sectors;
+    uint32_t cylinders;
+    uint32_t heads;
+    uint32_t sectors_per_track;
+} VMDK3Header;
+
+/*
+ * Header of a VMDK4 image as tdvmdk_open() reads it, directly after the
+ * 4-byte magic.  The structure is packed so that its layout is not altered
+ * by compiler padding.
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    int64_t capacity;             /* stored in td_state.size */
+    int64_t granularity;          /* stored in tdvmdk_state.cluster_sectors */
+    int64_t desc_offset;
+    int64_t desc_size;
+    int32_t num_gtes_per_gte;     /* stored in tdvmdk_state.l2_size */
+    int64_t rgd_offset;           /* shifted left by 9 to give l1_table_offset */
+    int64_t gd_offset;            /* shifted left by 9 to give l1_backup_table_offset */
+    int64_t grain_offset;
+    char filler[1];
+    char check_bytes[4];
+} __attribute__((packed)) VMDK4Header;
+
+/* Number of L2 tables kept in memory at once by get_cluster_offset(). */
+#define L2_CACHE_SIZE 16
+
+/* Per-disk private state of the VMDK driver. */
+struct tdvmdk_state {
+        int fd;           /* image file descriptor */
+       int poll_pipe[2]; /* dummy fd for polling on */
+       
+       unsigned int l1_size;             /* number of entries in l1_table */
+       int64_t l1_table_offset;          /* file offset used to load l1_table */
+       int64_t l1_backup_table_offset;   /* file offset of the backup table; 0 means none is loaded */
+       uint32_t l1_entry_sectors;        /* l2_size * cluster_sectors, as computed by tdvmdk_open() */
+       unsigned int l2_size;             /* number of entries in one L2 table */
+       
+       uint32_t *l1_table;               /* L1 table, converted from little-endian at load time */
+       uint32_t *l1_backup_table;        /* backup L1 table, loaded only when its offset is non-zero */
+       uint32_t *l2_cache;               /* L2_CACHE_SIZE slots of l2_size entries each */
+       uint32_t l2_cache_offsets[L2_CACHE_SIZE];  /* L1 entry value whose L2 table each slot holds */
+       uint32_t l2_cache_counts[L2_CACHE_SIZE];   /* hit counter per slot; the lowest is evicted */
+       
+       unsigned int cluster_sectors;     /* sectors per cluster, from the header granularity */
+};
+
+
+/*
+ * Open a VMDK3 or VMDK4 image and load its lookup tables.
+ *
+ * Creates the dummy poll pipe, opens the file, decodes the header that
+ * follows the 4-byte magic, reads the L1 table (plus the backup L1 table for
+ * VMDK4 images that have one) and allocates the L2 cache.
+ *
+ * Returns 0 on success and -1 on any failure.  On failure all tables are
+ * freed and reset to NULL and all descriptors opened here are closed and
+ * recorded as -1, so a later tdvmdk_close() cannot free or close them again.
+ */
+static int tdvmdk_open (struct td_state *s, const char *name)
+{
+       int ret, fd = -1;
+       int l1_size;
+       unsigned int i;
+       uint32_t magic;
+       struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private;
+
+       /* Start from a known state: the failure path frees these pointers
+        * and the L2 cache lookup compares against these arrays. */
+       prv->fd = -1;
+       prv->l1_table = NULL;
+       prv->l1_backup_table = NULL;
+       prv->l2_cache = NULL;
+       memset(prv->l2_cache_offsets, 0, sizeof(prv->l2_cache_offsets));
+       memset(prv->l2_cache_counts, 0, sizeof(prv->l2_cache_counts));
+
+       /* set up a pipe so that we can hand back a poll fd that won't fire.*/
+       ret = pipe(prv->poll_pipe);
+       if (ret != 0) {
+               prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+               return -1;
+       }
+
+       /* Open the file.  O_DIRECT is not requested here, so there is no
+        * fallback open and the log message does not mention it. */
+       fd = open(name, O_RDWR | O_LARGEFILE);
+       if (fd == -1) {
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+       DPRINTF("open(%s)\n", name);
+
+       prv->fd = fd;
+
+       /* Grok the vmdk header. */
+       if ((ret = read(fd, &magic, sizeof(magic))) != sizeof(magic))
+               goto fail;
+       magic = be32_to_cpu(magic);
+       if (magic == VMDK3_MAGIC) {
+               VMDK3Header header;
+               if (read(fd, &header, sizeof(header)) !=
+                       sizeof(header))
+                       goto fail;
+               prv->cluster_sectors = le32_to_cpu(header.granularity);
+               prv->l2_size = 1 << 9;
+               prv->l1_size = 1 << 6;
+               s->size = le32_to_cpu(header.disk_sectors);
+               /* Widen before shifting so the byte offset is not cut to 32 bits. */
+               prv->l1_table_offset =
+                       (int64_t)le32_to_cpu(header.l1dir_offset) << 9;
+               prv->l1_backup_table_offset = 0;
+               prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
+       } else if (magic == VMDK4_MAGIC) {
+               VMDK4Header header;
+
+               if (read(fd, &header, sizeof(header)) != sizeof(header))
+                       goto fail;
+               /* capacity and granularity are 64-bit header fields. */
+               s->size = le64_to_cpu(header.capacity);
+               prv->cluster_sectors = le64_to_cpu(header.granularity);
+               prv->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+               prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
+               if (prv->l1_entry_sectors == 0)
+                       goto fail;
+               prv->l1_size = (s->size + prv->l1_entry_sectors - 1)
+                              / prv->l1_entry_sectors;
+               prv->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+               prv->l1_backup_table_offset =
+                       le64_to_cpu(header.gd_offset) << 9;
+       } else {
+               goto fail;
+       }
+
+       /* Both are used as divisors when requests are mapped to clusters. */
+       if (prv->cluster_sectors == 0 || prv->l1_entry_sectors == 0)
+               goto fail;
+
+       /* read the L1 table */
+       l1_size = prv->l1_size * sizeof(uint32_t);
+       prv->l1_table = malloc(l1_size);
+       if (!prv->l1_table)
+               goto fail;
+       if (lseek(fd, prv->l1_table_offset, SEEK_SET) == -1)
+               goto fail;
+       if (read(fd, prv->l1_table, l1_size) != l1_size)
+               goto fail;
+       for (i = 0; i < prv->l1_size; i++) {
+               le32_to_cpus(&prv->l1_table[i]);
+       }
+
+       if (prv->l1_backup_table_offset) {
+               prv->l1_backup_table = malloc(l1_size);
+               if (!prv->l1_backup_table)
+                       goto fail;
+               if (lseek(fd, prv->l1_backup_table_offset, SEEK_SET) == -1)
+                       goto fail;
+               if (read(fd, prv->l1_backup_table, l1_size) != l1_size)
+                       goto fail;
+               for(i = 0; i < prv->l1_size; i++) {
+                       le32_to_cpus(&prv->l1_backup_table[i]);
+               }
+       }
+
+       prv->l2_cache = malloc(prv->l2_size * L2_CACHE_SIZE *sizeof(uint32_t));
+       if (!prv->l2_cache)
+               goto fail;
+       DPRINTF("VMDK File opened successfully\n");
+       return 0;
+
+fail:
+       DPRINTF("VMDK File open failed.\n");
+       safer_free(prv->l1_backup_table);
+       safer_free(prv->l1_table);
+       safer_free(prv->l2_cache);
+       if (fd != -1)
+               close(fd);
+       prv->fd = -1;
+       close(prv->poll_pipe[0]);
+       close(prv->poll_pipe[1]);
+       prv->poll_pipe[0] = prv->poll_pipe[1] = -1;
+       return -1;
+}
+
+/*
+ * Translate a byte offset within the virtual disk into the byte offset of
+ * its cluster within the image file.
+ *
+ * The L1 table selects an L2 table, which is served from a small cache of
+ * L2_CACHE_SIZE slots; on a miss the slot with the lowest hit count is
+ * reloaded from the file.  When the cluster has no mapping and allocate is
+ * non-zero, the file is extended by one cluster and the new mapping is
+ * written to the L2 table and, if a backup table is loaded, to its backup.
+ *
+ * Returns the cluster's byte offset in the file, or 0 when the offset is out
+ * of range, the cluster is unmapped and allocate is 0, or an I/O step fails.
+ */
+static uint64_t get_cluster_offset(struct td_state *s, 
+                                   uint64_t offset, int allocate)
+{
+       struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private;
+       unsigned int l1_index, l2_offset, l2_index;
+       int min_index, i, j;
+       uint32_t min_count, *l2_table, tmp;
+       uint64_t cluster_offset;
+       off_t end;
+
+       l1_index = (offset >> 9) / prv->l1_entry_sectors;
+       if (l1_index >= prv->l1_size)
+               return 0;
+       l2_offset = prv->l1_table[l1_index];
+       if (!l2_offset)
+               return 0;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (l2_offset == prv->l2_cache_offsets[i]) {
+                       /* increment the hit count */
+                       if (++prv->l2_cache_counts[i] == 0xffffffff) {
+                               for(j = 0; j < L2_CACHE_SIZE; j++) {
+                                       prv->l2_cache_counts[j] >>= 1;
+                               }
+                       }
+                       l2_table = prv->l2_cache + (i * prv->l2_size);
+                       goto found;
+               }
+       }
+       /* not found: load a new entry in the least used one */
+       min_index = 0;
+       min_count = 0xffffffff;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (prv->l2_cache_counts[i] < min_count) {
+                       min_count = prv->l2_cache_counts[i];
+                       min_index = i;
+               }
+       }
+       l2_table = prv->l2_cache + (min_index * prv->l2_size);
+       /* The slot is about to be overwritten: drop its old tag first so a
+        * failed load cannot leave it matching contents it no longer holds.
+        * Tag 0 never matches because l2_offset 0 is rejected above. */
+       prv->l2_cache_offsets[min_index] = 0;
+       prv->l2_cache_counts[min_index] = 0;
+       if (lseek(prv->fd, (int64_t)l2_offset * 512, SEEK_SET) == (off_t)-1)
+               return 0;
+       if (read(prv->fd, l2_table, prv->l2_size * sizeof(uint32_t)) != 
+                prv->l2_size * sizeof(uint32_t))
+               return 0;
+       prv->l2_cache_offsets[min_index] = l2_offset;
+       prv->l2_cache_counts[min_index] = 1;
+ found:
+       l2_index = ((offset >> 9) / prv->cluster_sectors) % prv->l2_size;
+       cluster_offset = le32_to_cpu(l2_table[l2_index]);
+       if (!cluster_offset) {
+               if (!allocate)
+                       return 0;
+               /* Append one cluster to the file. */
+               end = lseek(prv->fd, 0, SEEK_END);
+               if (end == (off_t)-1)
+                       return 0;
+               cluster_offset = end;
+               if (ftruncate(prv->fd, cluster_offset + 
+                             (prv->cluster_sectors << 9)) != 0)
+                       return 0;
+               cluster_offset >>= 9;
+               /* update L2 table: on disk first, then the cached copy */
+               tmp = cpu_to_le32(cluster_offset);
+               if (lseek(prv->fd, ((int64_t)l2_offset * 512) + 
+                         (l2_index * sizeof(tmp)), SEEK_SET) == (off_t)-1)
+                       return 0;
+               if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
+                       return 0;
+               l2_table[l2_index] = tmp;
+               /* update backup L2 table */
+               if (prv->l1_backup_table_offset != 0) {
+                       l2_offset = prv->l1_backup_table[l1_index];
+                       /* A zero entry would aim the write at the start of
+                        * the file, so there is no backup table to update. */
+                       if (l2_offset) {
+                               if (lseek(prv->fd, ((int64_t)l2_offset * 512) + 
+                                         (l2_index * sizeof(tmp)),
+                                         SEEK_SET) == (off_t)-1)
+                                       return 0;
+                               if (write(prv->fd, &tmp, sizeof(tmp)) != 
+                                   sizeof(tmp))
+                                       return 0;
+                       }
+               }
+       }
+       cluster_offset <<= 9;
+       return cluster_offset;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector into buf, one cluster
+ * at a time, and report the outcome through cb (0 on success, -1 on error).
+ * Clusters for which get_cluster_offset() returns 0 read back as zeroes.
+ *
+ * Always returns 1.
+ */
+static int tdvmdk_queue_read(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private;
+       int index_in_cluster, n;
+       uint64_t cluster_offset;
+       int err = 0;
+
+       while (nb_sectors > 0) {
+               cluster_offset = get_cluster_offset(s, sector << 9, 0);
+               index_in_cluster = sector % prv->cluster_sectors;
+               /* Never read across a cluster boundary in one go. */
+               n = prv->cluster_sectors - index_in_cluster;
+               if (n > nb_sectors)
+                       n = nb_sectors;
+               if (!cluster_offset) {
+                       memset(buf, 0, 512 * n);
+               } else {
+                       /* A failed seek would make read() fetch data from
+                        * wherever the file position happens to be. */
+                       if (lseek(prv->fd,
+                                 cluster_offset + index_in_cluster * 512,
+                                 SEEK_SET) == (off_t)-1 ||
+                           read(prv->fd, buf, n * 512) != n * 512) {
+                               err = -1;
+                               break;
+                       }
+               }
+               nb_sectors -= n;
+               sector     += n;
+               buf += n * 512;
+       }
+
+       cb(s, err, id, private);
+
+       return 1;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf to the image starting at
+ * sector, one cluster at a time, allocating clusters as needed, and report
+ * the outcome through cb (0 on success, -1 on error).
+ *
+ * Always returns 1.
+ */
+static int tdvmdk_queue_write(struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+       struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private;
+       int index_in_cluster, n;
+       uint64_t cluster_offset;
+       int err = 0;
+
+       while (nb_sectors > 0) {
+               /* Same arithmetic as the read path; a bit mask would only be
+                * right for cluster sizes that are powers of two. */
+               index_in_cluster = sector % prv->cluster_sectors;
+               /* Never write across a cluster boundary in one go. */
+               n = prv->cluster_sectors - index_in_cluster;
+               if (n > nb_sectors)
+                       n = nb_sectors;
+               cluster_offset = get_cluster_offset(s, sector << 9, 1);
+               if (!cluster_offset) {
+                       err = -1;
+                       break;
+               }
+               /* A failed seek would make write() put the data wherever
+                * the file position happens to be. */
+               if (lseek(prv->fd, cluster_offset + index_in_cluster * 512,
+                         SEEK_SET) == (off_t)-1 ||
+                   write(prv->fd, buf, n * 512) != n * 512) {
+                       err = -1;
+                       break;
+               }
+               nb_sectors -= n;
+               sector     += n;
+               buf += n * 512;
+       }
+
+       cb(s, err, id, private);
+
+       return 1;
+}
+               
+/*
+ * Submit hook of the VMDK driver.  The queue functions of this driver
+ * complete each request before they return, so there is nothing to do here.
+ * Always returns 0.
+ */
+static int tdvmdk_submit(struct td_state *s)
+{
+       (void)s;        /* no per-disk state is consulted */
+
+       return 0;
+}
+
+
+/*
+ * Build the descriptor list handed back for polling.
+ *
+ * Returns a heap-allocated array of MAX_IOFD ints whose first entry is the
+ * read end of the dummy poll pipe and whose remaining entries are 0, or NULL
+ * if the allocation fails.  The array is not retained here; the caller owns
+ * it.
+ */
+static int *tdvmdk_get_fd(struct td_state *s)
+{
+       struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private;
+       int *fds;
+
+       /* calloc() gives the zero-filled array the original loop built. */
+       fds = calloc(MAX_IOFD, sizeof(int));
+       if (fds == NULL)
+               return NULL;
+
+       fds[0] = prv->poll_pipe[0];
+       return fds;
+}
+
+/*
+ * Release the lookup tables and close the image descriptor and both ends of
+ * the poll pipe.  Each table pointer is left NULL afterwards.
+ * Always returns 0.
+ */
+static int tdvmdk_close(struct td_state *s)
+{
+       struct tdvmdk_state *vmdk = (struct tdvmdk_state *)s->private;
+
+       /* free(NULL) does nothing, so freeing and clearing unconditionally
+        * is the same as the NULL-guarded release. */
+       free(vmdk->l1_table);
+       vmdk->l1_table = NULL;
+       free(vmdk->l1_backup_table);
+       vmdk->l1_backup_table = NULL;
+       free(vmdk->l2_cache);
+       vmdk->l2_cache = NULL;
+
+       close(vmdk->fd);
+       close(vmdk->poll_pipe[0]);
+       close(vmdk->poll_pipe[1]);
+
+       return 0;
+}
+
+/* Callback hook of the VMDK driver: unconditionally requests a kick. */
+static int tdvmdk_do_callbacks(struct td_state *s, int sid)
+{
+       const int want_kick = 1;        /* always ask for a kick */
+
+       (void)s;
+       (void)sid;
+
+       return want_kick;
+}
+
+/*
+ * Driver descriptor for the VMDK backend.  The initializer is positional, so
+ * the entries must stay in the order in which struct tap_disk declares its
+ * members (declaration not visible here).  The values supplied are, in order:
+ * the driver name, the size of the per-disk private state, and the open,
+ * queue-read, queue-write, submit, get-fd, close and do-callbacks handlers.
+ * The handlers are static, so this object is their only way out of the file.
+ */
+struct tap_disk tapdisk_vmdk = {
+       "tapdisk_vmdk",
+       sizeof(struct tdvmdk_state),
+       tdvmdk_open,
+       tdvmdk_queue_read,
+       tdvmdk_queue_write,
+       tdvmdk_submit,
+       tdvmdk_get_fd,
+       tdvmdk_close,
+       tdvmdk_do_callbacks,
+};
+
diff --git a/tools/blktap/drivers/bswap.h b/tools/blktap/drivers/bswap.h
new file mode 100644 (file)
index 0000000..bb9de92
--- /dev/null
@@ -0,0 +1,202 @@
+#ifndef BSWAP_H
+#define BSWAP_H
+
+//#include "config-host.h"
+
+#include <inttypes.h>
+
+#ifdef HAVE_BYTESWAP_H
+#include <byteswap.h>
+#else
+
+/*
+ * Fallback byte swap for 16-bit values, used when <byteswap.h> is not
+ * available.  Written as a GCC statement expression so that the argument
+ * is evaluated exactly once (it is copied into __x first).
+ */
+#define bswap_16(x) \
+({ \
+       uint16_t __x = (x); \
+       ((uint16_t)( \
+               (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \
+               (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \
+})
+
+/*
+ * Fallback byte swap for 32-bit values: each of the four bytes is masked
+ * out and shifted to its mirrored position.  The argument is evaluated
+ * once, via the local copy __x.
+ */
+#define bswap_32(x) \
+({ \
+       uint32_t __x = (x); \
+       ((uint32_t)( \
+               (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \
+               (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) <<  8) | \
+               (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >>  8) | \
+               (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \
+})
+
+/*
+ * Fallback byte swap for 64-bit values: each of the eight bytes is
+ * masked out and shifted to its mirrored position.  The argument is
+ * evaluated once, via the local copy __x.
+ */
+#define bswap_64(x) \
+({ \
+       uint64_t __x = (x); \
+       ((uint64_t)( \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) <<  8) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >>  8) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \
+})
+
+#endif /* !HAVE_BYTESWAP_H */
+
+/* Return x with its two bytes exchanged. */
+static inline uint16_t bswap16(uint16_t x)
+{
+    uint16_t swapped = bswap_16(x);
+
+    return swapped;
+}
+
+/* Return x with the order of its four bytes reversed. */
+static inline uint32_t bswap32(uint32_t x)
+{
+    uint32_t swapped = bswap_32(x);
+
+    return swapped;
+}
+
+/* Return x with the order of its eight bytes reversed. */
+static inline uint64_t bswap64(uint64_t x)
+{
+    uint64_t swapped = bswap_64(x);
+
+    return swapped;
+}
+
+/* Byte-swap the 16-bit value stored at s, in place. */
+static inline void bswap16s(uint16_t *s)
+{
+    const uint16_t before = *s;
+
+    *s = bswap16(before);
+}
+
+/* Byte-swap the 32-bit value stored at s, in place. */
+static inline void bswap32s(uint32_t *s)
+{
+    const uint32_t before = *s;
+
+    *s = bswap32(before);
+}
+
+/* Byte-swap the 64-bit value stored at s, in place. */
+static inline void bswap64s(uint64_t *s)
+{
+    const uint64_t before = *s;
+
+    *s = bswap64(before);
+}
+
+/*
+ * Building blocks for CPU_CONVERT: <endian>_bswap(v, size) yields v
+ * converted between CPU order and <endian> order, <endian>_bswaps(p, size)
+ * converts *p in place.  Whichever byte order matches the host expands to
+ * a no-op; the other one pastes `size` onto "bswap" to pick bswap16/32/64.
+ *
+ * Note the *_bswaps forms carry their own trailing ';' and use `p`
+ * unparenthesised, so they are only meant to be expanded as a complete
+ * statement with a plain pointer name.  All four are #undef'd at the end
+ * of this header.
+ */
+#if defined(WORDS_BIGENDIAN)
+#define be_bswap(v, size) (v)
+#define le_bswap(v, size) bswap ## size(v)
+#define be_bswaps(v, size)
+#define le_bswaps(p, size) *p = bswap ## size(*p);
+#else
+#define le_bswap(v, size) (v)
+#define be_bswap(v, size) bswap ## size(v)
+#define le_bswaps(v, size)
+#define be_bswaps(p, size) *p = bswap ## size(*p);
+#endif
+
+/*
+ * CPU_CONVERT(endian, size, type) generates the family of conversion
+ * helpers for one byte order / width pair, e.g. for (be, 32, uint32_t):
+ *
+ *   be32_to_cpu(v)      value in, converted value out
+ *   cpu_to_be32(v)      same operation, opposite direction
+ *   be32_to_cpus(p)     convert *p in place
+ *   cpu_to_be32s(p)     convert *p in place
+ *   be32_to_cpup(p)     read *p and return it converted
+ *   cpu_to_be32w(p, v)  store the converted v to *p
+ *
+ * Both directions expand to the same <endian>_bswap helper, since a byte
+ * swap is its own inverse.
+ */
+#define CPU_CONVERT(endian, size, type)\
+static inline type endian ## size ## _to_cpu(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline type cpu_to_ ## endian ## size(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline void endian ## size ## _to_cpus(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## s(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline type endian ## size ## _to_cpup(const type *p)\
+{\
+    return endian ## size ## _to_cpu(*p);\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
+{\
+     *p = cpu_to_ ## endian ## size(v);\
+}
+
+/* Instantiate the big-endian helpers for 16, 32 and 64 bits. */
+CPU_CONVERT(be, 16, uint16_t)
+CPU_CONVERT(be, 32, uint32_t)
+CPU_CONVERT(be, 64, uint64_t)
+
+/* Instantiate the little-endian helpers for 16, 32 and 64 bits. */
+CPU_CONVERT(le, 16, uint16_t)
+CPU_CONVERT(le, 32, uint32_t)
+CPU_CONVERT(le, 64, uint64_t)
+
+/* unaligned versions (optimized for frequent unaligned accesses)*/
+
+#if defined(__i386__) || defined(__powerpc__)
+
+/*
+ * In this branch of the conditional the unaligned ("u") accessors simply
+ * forward to the regular pointer-based helpers generated by CPU_CONVERT.
+ */
+#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
+#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define le16_to_cpupu(p) le16_to_cpup(p)
+#define le32_to_cpupu(p) le32_to_cpup(p)
+
+#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
+#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
+
+#else
+
+/*
+ * Store v at p in little-endian byte order, one byte at a time, so the
+ * store never performs a 16-bit access through p.
+ */
+static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *out = (uint8_t *)p;
+
+    *out++ = v;         /* least significant byte first */
+    *out   = v >> 8;
+}
+
+/*
+ * Store v at p in little-endian byte order, one byte at a time, so the
+ * store never performs a 32-bit access through p.
+ */
+static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *out = (uint8_t *)p;
+    int n;
+
+    /* byte n receives bits [8n, 8n+7] of v */
+    for (n = 0; n < 4; n++)
+        out[n] = v >> (8 * n);
+}
+
+/*
+ * Read a little-endian 16-bit value from p one byte at a time and return
+ * it in CPU order.
+ */
+static inline uint16_t le16_to_cpupu(const uint16_t *p)
+{
+    const uint8_t *in = (const uint8_t *)p;
+    const uint8_t lo = in[0];
+    const uint8_t hi = in[1];
+
+    return lo | (hi << 8);
+}
+
+/*
+ * Read a little-endian 32-bit value from p one byte at a time and return
+ * it in CPU order.
+ *
+ * Each byte is widened to uint32_t before shifting: a bare uint8_t is
+ * promoted to (signed) int, and shifting a byte >= 0x80 left by 24 would
+ * overflow that int, which is undefined behaviour.
+ */
+static inline uint32_t le32_to_cpupu(const uint32_t *p)
+{
+    const uint8_t *p1 = (const uint8_t *)p;
+
+    return  (uint32_t)p1[0]        |
+           ((uint32_t)p1[1] <<  8) |
+           ((uint32_t)p1[2] << 16) |
+           ((uint32_t)p1[3] << 24);
+}
+
+/*
+ * Store v at p in big-endian byte order, one byte at a time, so the
+ * store never performs a 16-bit access through p.
+ */
+static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *out = (uint8_t *)p;
+
+    *out++ = v >> 8;    /* most significant byte first */
+    *out   = v;
+}
+
+/*
+ * Store v at p in big-endian byte order, one byte at a time, so the
+ * store never performs a 32-bit access through p.
+ */
+static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *out = (uint8_t *)p;
+    int n;
+
+    /* byte n receives the n-th most significant byte of v */
+    for (n = 0; n < 4; n++)
+        out[n] = v >> (24 - 8 * n);
+}
+
+#endif
+
+/*
+ * cpu_to_32wu: unaligned 32-bit store in the host's own byte order,
+ * selected by WORDS_BIGENDIAN.
+ */
+#ifdef WORDS_BIGENDIAN
+#define cpu_to_32wu cpu_to_be32wu
+#else
+#define cpu_to_32wu cpu_to_le32wu
+#endif
+
+#undef le_bswap
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
+#endif /* BSWAP_H */
diff --git a/tools/blktap/drivers/img2qcow.c b/tools/blktap/drivers/img2qcow.c
new file mode 100644 (file)
index 0000000..2c9974c
--- /dev/null
@@ -0,0 +1,289 @@
+/* img2qcow.c
+ *
+ * Generates a qcow format disk and fills it from an existing image.
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include "tapdisk.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define TAPDISK 1
+#define BLOCK_PROCESSSZ 4096
+
+static int maxfds, *io_fd, running = 1, complete = 0;
+static int returned_events = 0, submit_events = 0;
+static uint64_t prev = 0;
+static char output[25];
+
+/*
+ * Debug helper: dump `length` bytes starting at `ptr` as hex through
+ * DFPRINTF.  A newline is emitted after every byte whose index is a
+ * multiple of 16 and a space after every other even index.
+ *
+ * Output is unchanged from the original; only the unused local and the
+ * pointless dereference in `*p++;` were removed.
+ */
+void print_bytes(void *ptr, int length) {
+
+    int k;
+    unsigned char *p = ptr;
+
+    DFPRINTF("Buf dump, length %d:\n",length);
+    for (k = 0; k < length; k++) {
+        DFPRINTF("%x",*p);
+        p++;
+        if (k % 16 == 0) DFPRINTF("\n");
+        else if (k % 2 == 0) DFPRINTF(" ");
+    }
+    DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar kept in the file-scope `output`
+ * buffer.
+ *
+ * progress: amount processed so far
+ * size:     total amount, in the same unit as progress
+ *
+ * The bar has 20 steps of 5% each.  At most one step is drawn per call,
+ * and `prev` counts the steps drawn so far.
+ */
+void debug_output(uint64_t progress, uint64_t size)
+{
+       uint64_t blocks = size/20;
+
+       /* a total below 20 has no 5% step; avoid dividing by zero */
+       if (blocks == 0)
+               return;
+
+       /*Output progress every 5% */
+       /* prev < 20 keeps the two-byte "=>" inside the bar */
+       if (prev < 20 && progress/blocks > prev) {
+               memcpy(output+prev+1,"=>",2);
+               prev++;
+               DFPRINTF("\r%s     %llu%%", output,
+                       (unsigned long long)(prev-1)*5);
+       }
+}
+
+/*
+ * Prepare `readfds` for select(): the set is cleared and then holds only
+ * io_fd[0].  maxfds is updated to io_fd[0] + 1.
+ *
+ * The set must be cleared on every call: the caller's fd_set starts out
+ * uninitialised and select() rewrites it on return.
+ */
+static inline void LOCAL_FD_SET(fd_set *readfds)
+{
+       FD_ZERO(readfds);
+       FD_SET(io_fd[0], readfds);
+       maxfds = io_fd[0] + 1;
+}
+
+/*
+ * Fill in s->size (in units of 1 << SECTOR_SHIFT bytes) and
+ * s->sector_size for the already opened image `fd`.
+ *
+ * Block devices are queried with the BLKGETSIZE / BLKSSZGET ioctls;
+ * anything else is sized from fstat().
+ *
+ * Returns 0 on success and -EINVAL if fstat() or BLKGETSIZE fails.
+ */
+static int get_image_info(struct td_state *s, int fd)
+{
+       int ret;
+       struct stat stat;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DFPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*
+                * BLKGETSIZE stores an unsigned long.  Read it into a
+                * local of exactly that type rather than into the struct
+                * field, whose width need not match.
+                */
+               unsigned long nr_sectors = 0;
+
+               /*Accessing block device directly*/
+               s->size = 0;
+               if (ioctl(fd,BLKGETSIZE,&nr_sectors)!=0) {
+                       DFPRINTF("ERR: BLKGETSIZE failed, "
+                                "couldn't stat image");
+                       return -EINVAL;
+               }
+               s->size = nr_sectors;
+
+               DFPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(s->size << SECTOR_SHIFT),
+                       (long long unsigned)s->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       /* BLKSSZGET stores an int; same reasoning as above */
+                       int ssz = DEFAULT_SECTOR_SIZE;
+
+                       s->sector_size = DEFAULT_SECTOR_SIZE;
+                       /* on failure the default is kept, as before */
+                       if (ioctl(fd, BLKSSZGET, &ssz) == 0)
+                               s->sector_size = ssz;
+
+                       if (s->sector_size != DEFAULT_SECTOR_SIZE)
+                               DFPRINTF("Note: sector size is %ld (not %d)\n",
+                                       s->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               s->size = (stat.st_size >> SECTOR_SHIFT);
+               s->sector_size = DEFAULT_SECTOR_SIZE;
+               DFPRINTF("Image size: [%llu]\n",
+                       (long long unsigned)s->size);
+       }
+
+       return 0;
+}
+
+/*
+ * Completion callback handed to td_queue_write() by main().
+ *
+ * res:     result of the request; negative values are logged
+ * private: the block buffer main() passed along, released here
+ *
+ * Every completion, failed or not, is counted in returned_events.
+ * Always returns 0.
+ */
+static int send_responses(struct td_state *s, int res, int idx, void *private)
+{
+       if (res < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",res);
+       }
+
+       returned_events += 1;
+
+       free(private);
+
+       return 0;
+}
+
+/*
+ * img2qcow <QCOW FILENAME> <SRC IMAGE>
+ *
+ * Creates a qcow file sized after the source image, then copies the
+ * source into it in BLOCK_PROCESSSZ chunks through the tapdisk_qcow
+ * driver.  Each chunk buffer is handed to the driver as the request's
+ * private pointer and released by send_responses().
+ *
+ * Exits with -1 on a usage or setup error and returns 0 once the copy
+ * loop has finished.
+ */
+int main(int argc, char *argv[])
+{
+       struct tap_disk *drv;
+       struct td_state *s;
+       int ret = -1, fd, len;
+       fd_set readfds;
+       struct timeval timeout;
+       uint64_t i;
+       char *buf;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n", 
+                       argv[0]);
+               exit(-1);
+       }
+
+       s = malloc(sizeof(struct td_state));
+       if (s == NULL) {
+               DFPRINTF("Unable to allocate disk state\n");
+               exit(-1);
+       }
+
+       /*Open image*/
+       fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+       if (fd == -1) {
+               DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
+               exit(-1);
+       }
+
+       /* without a valid size s->size would be used uninitialised below */
+       if (get_image_info(s, fd) != 0) {
+               DFPRINTF("Unable to determine size of [%s]\n", argv[2]);
+               exit(-1);
+       }
+
+       /*Create qcow file*/
+       ret = qcow_create(argv[1],s->size<<SECTOR_SHIFT,NULL,0);
+
+       if (ret < 0) {
+               DFPRINTF("Unable to create QCOW file\n");
+               exit(-1);
+       } else DFPRINTF("Qcow file created: size %llu sectors\n",
+                       (long long unsigned)s->size);
+
+       drv = &tapdisk_qcow;
+       s->private = malloc(drv->private_data_size);
+       if (s->private == NULL) {
+               DFPRINTF("Unable to allocate driver state\n");
+               exit(-1);
+       }
+
+       /*Open qcow file*/
+       if (drv->td_open(s, argv[1])!=0) {
+               DFPRINTF("Unable to open Qcow file [%s]\n",argv[1]);
+               exit(-1);
+       }
+
+       io_fd = drv->td_get_fd(s);
+       if (io_fd == NULL) {
+               DFPRINTF("Unable to get driver descriptors\n");
+               exit(-1);
+       }
+
+       /*Initialise the output string*/
+       memset(output,0x20,25);
+       output[0] = '[';
+       output[22] = ']';
+       output[23] = '\0';
+       DFPRINTF("%s",output);
+
+       i = 0;
+       while (running) {
+               timeout.tv_sec = 0;
+
+               if (!complete) {
+                       /*Read sector from image*/
+                       if (lseek(fd, i, SEEK_SET) == (off_t)-1) {
+                               DFPRINTF("Unable to access file offset %llu\n",
+                                      (long long)i);
+                               exit(-1);
+                       }
+
+                       if( (ret = posix_memalign((void **)&buf, 
+                                                 BLOCK_PROCESSSZ, 
+                                                 BLOCK_PROCESSSZ)) != 0) {
+                               DFPRINTF("Unable to read memalign buf (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*We attempt to read 4k sized blocks*/
+                       len = read(fd, buf, BLOCK_PROCESSSZ);
+                       if (len < 512) {
+                               DFPRINTF("Unable to read sector %llu\n",
+                                      (long long unsigned) (i >> 9));
+                               /*
+                                * This buffer was never queued, so no
+                                * completion callback will release it.
+                                */
+                               free(buf);
+                               complete = 1;
+                               continue;
+                       }
+
+                       /* only whole 512-byte sectors are written */
+                       if (len % 512) {
+                               len = (len >> 9) << 9;
+                       }
+
+                       ret = drv->td_queue_write(s, i >> 9,
+                                                 len >> 9, buf, 
+                                                 send_responses, 0, buf);
+
+                       if (!ret) submit_events++;
+
+                       if (ret < 0) {
+                               DFPRINTF("UNABLE TO WRITE block [%llu]\n",
+                                      (long long unsigned) (i >> 9));
+                       } else i += len;
+
+                       if (i >> 9 == s->size) complete = 1;
+
+                       debug_output(i,s->size << 9);
+
+                       if ((submit_events % 10 == 0) || complete) 
+                               drv->td_submit(s);
+                       timeout.tv_usec = 0;
+
+               } else {
+                       timeout.tv_usec = 1000;
+                       if (!submit_events) running = 0;
+               }
+
+
+               /*Check AIO FD*/
+               /* start from an empty set: select() rewrites it each time */
+               FD_ZERO(&readfds);
+               LOCAL_FD_SET(&readfds);
+               ret = select(maxfds + 1, &readfds, (fd_set *) 0,
+                            (fd_set *) 0, &timeout);
+
+               if (ret > 0) drv->td_do_callbacks(s, 0);
+               if (complete && (returned_events == submit_events)) 
+                       running = 0;
+       }
+       memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+       drv->td_close(s);
+       close(fd);
+       free(s->private);
+       free(s);
+
+       return 0;
+}
diff --git a/tools/blktap/drivers/qcow-create.c b/tools/blktap/drivers/qcow-create.c
new file mode 100644 (file)
index 0000000..be47393
--- /dev/null
@@ -0,0 +1,80 @@
+/* qcow-create.c
+ *
+ * Generates a qcow format disk.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include "tapdisk.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+/*
+ * qcow-create <SIZE(MB)> <FILENAME> [<BACKING_FILENAME>]
+ *
+ * Creates a qcow file of the requested size, optionally on top of a
+ * backing file.
+ *
+ * Changes from the original:
+ *  - the size is parsed with strtoll(), and non-numeric, negative or
+ *    overflowing values are rejected instead of silently becoming a
+ *    bogus size;
+ *  - the outcome is reported through DFPRINTF like the tool's other
+ *    messages, not through DPRINTF;
+ *  - a failed qcow_create() yields a non-zero exit status.
+ */
+int main(int argc, char *argv[])
+{
+       int ret = -1;
+       uint64_t size;
+       long long size_mb;
+       char *end;
+
+       if ( (argc < 3) || (argc > 4) ) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, 
+                       "usage: %s <SIZE(MB)> <FILENAME> "
+                       "[<BACKING_FILENAME>]\n", 
+                       argv[0]);
+               exit(-1);
+       }
+
+       size_mb = strtoll(argv[1], &end, 10);
+       if (end == argv[1] || size_mb < 0 ||
+           (uint64_t)size_mb > (((uint64_t)-1) >> 20)) {
+               fprintf(stderr, "Invalid size [%s]: expected a "
+                       "non-negative number of MB\n", argv[1]);
+               exit(-1);
+       }
+
+       /* MB -> bytes */
+       size = (uint64_t)size_mb << 20;
+       DFPRINTF("Creating file size %llu\n",(long long unsigned)size);
+
+       /* argc is 3 or 4 here; the 4th argument is the backing file */
+       ret = qcow_create(argv[2], size, (argc == 4) ? argv[3] : NULL, 0);
+
+       if (ret < 0) {
+               DFPRINTF("Unable to create QCOW file\n");
+               return 1;
+       }
+       DFPRINTF("QCOW file successfully created\n");
+
+       return 0;
+}
diff --git a/tools/blktap/drivers/qcow2raw.c b/tools/blktap/drivers/qcow2raw.c
new file mode 100644 (file)
index 0000000..a7abc1b
--- /dev/null
@@ -0,0 +1,346 @@
+/* qcow2raw.c
+ *
+ * Generates raw image data from an existing qcow image
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include "tapdisk.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define TAPDISK 1
+#define BLOCK_PROCESSSZ 4096
+
+static int maxfds, *qcowio_fd, *aio_fd, running = 1, complete = 0; 
+static int read_complete = 0, write_complete = 0;
+static int returned_read_events = 0, returned_write_events = 0;
+static int submit_events = 0;
+static uint32_t read_idx = 0, write_idx = 0;
+struct tap_disk *drv1, *drv2;
+struct td_state *sqcow, *saio;
+static uint64_t prev = 0, written = 0;
+static char output[25];
+
+/*
+ * Debug helper: dump `length` bytes starting at `ptr` as hex through
+ * DFPRINTF.  A newline is emitted after every byte whose index is a
+ * multiple of 16 and a space after every other even index.
+ *
+ * Output is unchanged from the original; only the unused local and the
+ * pointless dereference in `*p++;` were removed.
+ */
+void print_bytes(void *ptr, int length) {
+
+    int k;
+    unsigned char *p = ptr;
+
+    DFPRINTF("Buf dump, length %d:\n",length);
+    for (k = 0; k < length; k++) {
+        DFPRINTF("%x",*p);
+        p++;
+        if (k % 16 == 0) DFPRINTF("\n");
+        else if (k % 2 == 0) DFPRINTF(" ");
+    }
+    DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar kept in the file-scope `output`
+ * buffer.
+ *
+ * progress: amount processed so far
+ * size:     total amount, in the same unit as progress
+ *
+ * The bar has 20 steps of 5% each.  At most one step is drawn per call,
+ * and `prev` counts the steps drawn so far.
+ */
+void debug_output(uint64_t progress, uint64_t size)
+{
+       /*Output progress every 5% */
+       uint64_t blocks = size/20;
+
+       /* a total below 20 has no 5% step; avoid dividing by zero */
+       if (blocks == 0)
+               return;
+
+       /* prev < 20 keeps the two-byte "=>" inside the bar */
+       if (prev < 20 && progress/blocks > prev) {
+               memcpy(output+prev+1,"=>",2);
+               prev++;
+               DFPRINTF("\r%s     %llu%%", 
+                       output, (unsigned long long)((prev-1)*5));
+       }
+}
+
+/*
+ * Prepare `readfds` for select(): the set is cleared and then holds
+ * qcowio_fd[0] and aio_fd[0].  maxfds is updated to the larger of the
+ * two plus one.
+ *
+ * The set must be cleared on every call: the caller's fd_set starts out
+ * uninitialised and select() rewrites it on return.
+ */
+static inline void LOCAL_FD_SET(fd_set *readfds)
+{
+       FD_ZERO(readfds);
+       FD_SET(qcowio_fd[0], readfds);
+       FD_SET(aio_fd[0], readfds);
+
+       maxfds = (qcowio_fd[0] > aio_fd[0] ? qcowio_fd[0] : aio_fd[0]) + 1;
+}
+
+/*
+ * Completion callback for writes to the destination image.
+ *
+ * res:     result of the write; negative values are logged
+ * idx:     request id, recorded in write_idx
+ * private: the block buffer travelling with the request, released here
+ *
+ * A failed write is still counted as returned and its buffer is still
+ * freed.  The original returned early on failure, which leaked the
+ * buffer and left returned_write_events permanently behind
+ * submit_events, so main() never left its loop.  Only successful writes
+ * advance `written` and the progress bar.
+ *
+ * Always returns 0.
+ */
+static int send_write_responses(struct td_state *s, int res, int idx, void *private)
+{
+       if (res < 0)
+               DFPRINTF("AIO FAILURE: res [%d]!\n",res);
+       else
+               written += BLOCK_PROCESSSZ;
+
+       returned_write_events++;
+       write_idx = idx;
+       if (complete && (returned_write_events == submit_events)) 
+               write_complete = 1;
+
+       if (res >= 0)
+               debug_output(written, s->size << 9);
+       free(private);
+       return 0;
+}
+
+/*
+ * Completion callback for reads from the qcow source.
+ *
+ * The block just read (`private`) is queued for writing to the
+ * destination at sector `idx`, with send_write_responses() as its
+ * completion callback.  Queued writes are submitted after every tenth
+ * returned read and after the last outstanding one.
+ *
+ * A negative `res` is only logged; the block is queued regardless.
+ * Always returns 0.
+ */
+static int send_read_responses(struct td_state *s, int res, int idx, void *private)
+{
+       int queued, last_read;
+
+       if (res < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",res);
+       }
+
+       returned_read_events += 1;
+       read_idx = idx;
+
+       /* true once every submitted read has come back */
+       last_read = complete && (returned_read_events == submit_events);
+       if (last_read)
+               read_complete = 1;
+
+       queued = drv2->td_queue_write(saio, idx, BLOCK_PROCESSSZ>>9, private, 
+                                     send_write_responses, idx, private);
+       if (queued != 0) {
+               DFPRINTF("ERROR in submitting queue write!\n");
+               return 0;
+       }
+
+       if (last_read || (returned_read_events % 10 == 0))
+               drv2->td_submit(saio);
+
+       return 0;
+}
+
+/*
+ * qcow2raw <destination> <Qcow SRC IMAGE>
+ *
+ * Reads the qcow image through the tapdisk_qcow driver in
+ * BLOCK_PROCESSSZ chunks and writes each chunk to the destination
+ * through the tapdisk_aio driver.  The read completion callback queues
+ * the matching write.
+ *
+ * A missing destination is created and sized with ftruncate().  An
+ * existing regular file is resized after confirmation.  An existing
+ * block device must be at least as large as the source.
+ *
+ * Changes from the original:
+ *  - BLKGETSIZE reports 512-byte sectors, so the result is now compared
+ *    with sqcow->size (also sectors) rather than with the byte count,
+ *    and it is read into an unsigned long as the ioctl expects;
+ *  - allocations and td_get_fd() results are checked;
+ *  - the fd_set is cleared before every select();
+ *  - the error for a failed destination open no longer says "Qcow file".
+ */
+int main(int argc, char *argv[])
+{
+       int ret = -1, fd;
+       unsigned long size = 0;
+       fd_set readfds;
+       struct timeval timeout;
+       uint64_t i;
+       char *buf;
+       struct stat finfo;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <Dest File descriptor> "
+                       "<Qcow SRC IMAGE>\n", 
+                      argv[0]);
+               exit(-1);
+       }
+
+       sqcow = malloc(sizeof(struct td_state));
+       saio  = malloc(sizeof(struct td_state));
+       if (sqcow == NULL || saio == NULL) {
+               DFPRINTF("Unable to allocate disk state\n");
+               exit(-1);
+       }
+
+       /*Open qcow source file*/
+       drv1 = &tapdisk_qcow;
+       sqcow->private = malloc(drv1->private_data_size);
+       if (sqcow->private == NULL) {
+               DFPRINTF("Unable to allocate driver state\n");
+               exit(-1);
+       }
+
+       if (drv1->td_open(sqcow, argv[2])!=0) {
+               DFPRINTF("Unable to open Qcow file [%s]\n",argv[2]);
+               exit(-1);
+       } else DFPRINTF("QCOW file opened, size %llu\n",
+                     (long long unsigned)sqcow->size);
+
+       qcowio_fd = drv1->td_get_fd(sqcow);
+       if (qcowio_fd == NULL) {
+               DFPRINTF("Unable to get driver descriptors\n");
+               exit(-1);
+       }
+
+       /*Setup aio destination file*/
+       ret = stat(argv[1],&finfo);
+       if (ret == -1) {
+               /*Check errno*/
+               switch(errno) {
+               case ENOENT:
+                       /*File doesn't exist, create*/
+                       fd = open(argv[1], 
+                                 O_RDWR | O_LARGEFILE | O_CREAT, 0644);
+                       if (fd < 0) {
+                               DFPRINTF("ERROR creating file [%s] "
+                                        "(errno %d)\n",
+                                      argv[1], 0 - errno);
+                               exit(-1);
+                       }
+                       if (ftruncate(fd, (off_t)sqcow->size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %llu (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1], 
+                                       (long long unsigned)sqcow->size<<9, 
+                                       0 - errno);
+                               close(fd);
+                               exit(-1);
+                       }
+                       close(fd);
+                       break;
+               case  ENXIO:
+                       DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
+                       exit(-1);
+               default: 
+                       DFPRINTF("An error occurred opening Device [%s] "
+                                "(errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+       } else {
+               fprintf(stderr, "WARNING: All existing data in "
+                       "%s will be overwritten.\nDo you wish to continue? "
+                       "(y or n)  ",
+                       argv[1]);
+               if (getchar() != 'y') {
+                       DFPRINTF("Exiting...\n");
+                       exit(-1);
+               }
+
+               /*TODO - Test the existing file or device for adequate space*/
+               fd = open(argv[1], O_RDWR | O_LARGEFILE);
+               if (fd < 0) {
+                       DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+
+               if (S_ISBLK(finfo.st_mode)) {
+                       if(ioctl(fd,BLKGETSIZE,&size)!=0) {
+                               DFPRINTF("ERROR: BLKGETSIZE failed, "
+                                       "couldn't stat image [%s]\n", 
+                                       argv[1]);
+                               close(fd);
+                               exit(-1);
+                       }
+                       /*
+                        * `size` is in 512-byte sectors, like sqcow->size;
+                        * compare sectors with sectors and report bytes.
+                        */
+                       if (size < sqcow->size) {
+                               DFPRINTF("ERROR: Not enough space on device "
+                                       "%s (%llu bytes available, %llu bytes required\n",
+                                       argv[1], 
+                                       (long long unsigned)size<<9, 
+                                       (long long unsigned)sqcow->size<<9);
+                               close(fd);
+                               exit(-1);
+                       }
+               } else {
+                       if (ftruncate(fd, (off_t)sqcow->size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %llu (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1], 
+                                       (long long unsigned)sqcow->size<<9, 
+                                        0 - errno);
+                               close(fd);
+                               exit(-1);
+                       } else DFPRINTF("File [%s] truncated to length %llu "
+                                       "(%llu)\n", 
+                                      argv[1], 
+                                      (long long unsigned)sqcow->size<<9, 
+                                      (long long unsigned)sqcow->size);
+               }
+               close(fd);
+       }
+
+       /*Open aio destination file*/
+       drv2 = &tapdisk_aio;
+       saio->private = malloc(drv2->private_data_size);
+       if (saio->private == NULL) {
+               DFPRINTF("Unable to allocate driver state\n");
+               exit(-1);
+       }
+
+       if (drv2->td_open(saio, argv[1])!=0) {
+               DFPRINTF("Unable to open destination file [%s]\n", argv[1]);
+               exit(-1);
+       }
+
+       aio_fd = drv2->td_get_fd(saio);
+       if (aio_fd == NULL) {
+               DFPRINTF("Unable to get driver descriptors\n");
+               exit(-1);
+       }
+
+       /*Initialise the output string*/
+       memset(output,0x20,25);
+       output[0] = '[';
+       output[22] = ']';
+       output[23] = '\0';
+       DFPRINTF("%s",output);
+
+       i = 0;
+       while (running) {
+               timeout.tv_sec = 0;
+
+               if (!complete) {
+                       /*Read Pages from qcow image*/
+                       if ( (ret = posix_memalign((void **)&buf, 
+                                                  BLOCK_PROCESSSZ, 
+                                                  BLOCK_PROCESSSZ))
+                            != 0) {
+                               DFPRINTF("Unable to alloc memory (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*Attempt to read 4k sized blocks*/
+                       ret = drv1->td_queue_read(sqcow, i>>9,
+                                                 BLOCK_PROCESSSZ>>9, buf, 
+                                                 send_read_responses, i>>9, buf);
+
+                       if (ret < 0) {
+                               DFPRINTF("UNABLE TO READ block [%llu]\n",
+                                      (long long unsigned)i);
+                               exit(-1);
+                       } else {
+                               i += BLOCK_PROCESSSZ;
+                               submit_events++;
+                       }
+
+                       if (i >= sqcow->size<<9) {
+                               complete = 1;
+                       }
+
+                       if ((submit_events % 10 == 0) || complete) 
+                               drv1->td_submit(sqcow);
+                       timeout.tv_usec = 0;
+
+               } else {
+                       timeout.tv_usec = 1000;
+                       if (!submit_events) running = 0;
+               }
+
+
+               /*Check AIO FD*/
+               /* start from an empty set: select() rewrites it each time */
+               FD_ZERO(&readfds);
+               LOCAL_FD_SET(&readfds);
+               ret = select(maxfds + 1, &readfds, (fd_set *) 0,
+                            (fd_set *) 0, &timeout);
+
+               if (ret > 0) {
+                       if (FD_ISSET(qcowio_fd[0], &readfds)) 
+                               drv1->td_do_callbacks(sqcow, 0);
+                       if (FD_ISSET(aio_fd[0], &readfds)) 
+                               drv2->td_do_callbacks(saio, 0);
+               }
+               if (complete && (returned_write_events == submit_events)) 
+                       running = 0;
+       }
+       memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+       return 0;
+}
diff --git a/tools/blktap/drivers/tapdisk.c b/tools/blktap/drivers/tapdisk.c
new file mode 100644 (file)
index 0000000..f817a89
--- /dev/null
@@ -0,0 +1,671 @@
+/* tapdisk.c
+ *
+ * separate disk process, spawned by blktapctrl. Inherits code from driver 
+ * plugins
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ *
+ */
+
+#define MSG_SIZE 4096
+#define TAPDISK
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/poll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <pthread.h>
+#include <time.h>
+#include <err.h>
+#include <poll.h>
+#include <sys/statvfs.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include "blktaplib.h"
+#include "tapdisk.h"
+
+/*
+ * ASSERT(_p): when _p is false, log the failed expression together with its
+ * line and file, then force a crash by writing through a NULL pointer.
+ * The body is wrapped in do { } while (0) so that the macro expands to a
+ * single statement and stays safe inside an unbraced if/else.
+ */
+#if 1
+#define ASSERT(_p) \
+    do { if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", \
+    #_p , __LINE__, __FILE__); *(int*)0=0; } } while (0)
+#else
+#define ASSERT(_p) ((void)0)
+#endif 
+
+#define INPUT 0
+#define OUTPUT 1
+
+/* maxfds: highest descriptor handed to select(); fds[]: control fifos. */
+static int maxfds, fds[2];
+
+/*
+ * Main-loop flag.  It is cleared from sig_handler(), which is installed as
+ * a signal handler, so it is declared volatile sig_atomic_t.
+ */
+static volatile sig_atomic_t run = 1;
+
+/* Pid reported to blktapctrl and to the tap device (set on CTLMSG_PID). */
+static pid_t process;
+/* Number of disks counted as connected; updated by read_msg(). */
+int connected_disks = 0;
+/* Head of the doubly linked list of per-disk descriptor entries. */
+fd_list_entry_t *fd_start = NULL;
+
+/* Print the version banner and the command-line synopsis, then exit(-1). */
+void usage(void) 
+{
+       fputs("blktap-utils: v1.0.0\n"
+             "usage: tapdisk <READ fifo> <WRITE fifo>\n", stderr);
+       exit(-1);
+}
+
+/*
+ * Detach from the launching process: fork once, let the parent exit with
+ * status 0 and continue in the child.  Nothing is done when the parent pid
+ * is already 1.  A failed fork() is logged and ends the process with a
+ * failure status instead of being mistaken for the parent side of a
+ * successful fork.
+ */
+void daemonize(void)
+{
+       int i;
+       pid_t pid;
+
+       if (getppid()==1) return; /* already a daemon */
+
+       pid = fork();
+       if (pid < 0) {
+               DPRINTF("fork failed: %s\n", strerror(errno));
+               exit(-1);
+       }
+       if (pid != 0) exit(0); /* parent: the child carries on */
+
+#if 0
+       /*Set new program session ID and close all descriptors*/
+       setsid();
+       for (i = getdtablesize(); i >= 0; --i) close(i);
+
+       /*Send all I/O to /dev/null */
+       i = open("/dev/null",O_RDWR);
+       dup(i); 
+       dup(i);
+#endif
+       return;
+}
+
+/*
+ * Tear down one disk: close it through its driver, unmap the shared ring
+ * region, close the tap device descriptor, unlink the disk's entry from
+ * the fd list and free the entry, the blkif, the ring info and the state
+ * itself.
+ *
+ * s->private is not freed here.  'info' is now checked once for both the
+ * munmap() and the close(), and a mapping left at MAP_FAILED is never
+ * passed to munmap().
+ */
+static void unmap_disk(struct td_state *s)
+{
+       tapdev_info_t *info = s->ring_info;
+       struct tap_disk *drv = s->drv;
+       fd_list_entry_t *entry = s->fd_entry;
+       fd_list_entry_t *prev = entry->prev;
+       fd_list_entry_t *next = entry->next;
+
+       drv->td_close(s);
+
+       if (info != NULL) {
+               if (info->mem != NULL && info->mem != MAP_FAILED)
+                       munmap(info->mem, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE);
+               close(info->fd);
+       }
+
+       /* Unlink the entry; the list head moves when we were first. */
+       if (prev)
+               prev->next = next;
+       else
+               fd_start = next;
+       if (next)
+               next->prev = prev;
+
+       free(entry);
+       free(s->blkif);
+       free(info);
+       free(s);
+
+       return;
+
+}
+
+/*
+ * Close request (signal or CTLMSG_CLOSE): the main loop is only told to
+ * stop once no disk is counted as connected any more.
+ */
+void sig_handler(int sig)
+{
+       if (connected_disks >= 1)
+               return;
+
+       run = 0;
+}
+
+/*
+ * Add the descriptors of every list entry that has a non-zero tap_fd to
+ * 'readfds': the tap fd itself plus each non-zero io fd.  The file-scope
+ * 'maxfds' is raised to the largest descriptor seen.  Always returns 0.
+ */
+static inline int LOCAL_FD_SET(fd_set *readfds)
+{
+       fd_list_entry_t *entry;
+       int n;
+
+       for (entry = fd_start; entry != NULL; entry = entry->next) {
+               /* entries without a tap device are skipped entirely */
+               if (!entry->tap_fd)
+                       continue;
+
+               FD_SET(entry->tap_fd, readfds);
+               for (n = 0; n < MAX_IOFD; n++) {
+                       if (entry->io_fd[n]) 
+                               FD_SET(entry->io_fd[n], readfds);
+                       if (entry->io_fd[n] > maxfds)
+                               maxfds = entry->io_fd[n];
+               }
+               if (entry->tap_fd > maxfds)
+                       maxfds = entry->tap_fd;
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate a descriptor entry for disk 's', record it in s->fd_entry, fill
+ * in the tap fd and the MAX_IOFD io fds, and append it to the tail of the
+ * list headed by fd_start.  The cookie field is left for the caller to
+ * set.  Returns the new entry.
+ */
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, int io_fd[MAX_IOFD], struct td_state *s)
+{
+       fd_list_entry_t *node, *tail;
+       int n;
+
+       DPRINTF("Adding fd_list_entry\n");
+
+       node = malloc(sizeof(fd_list_entry_t));
+       s->fd_entry = node;
+       node->tap_fd = tap_fd;
+       for (n = 0; n < MAX_IOFD; n++)
+               node->io_fd[n] = io_fd[n];
+       node->s = s;
+       node->next = NULL;
+
+       if (fd_start == NULL) {
+               /* empty list: the new node becomes the head */
+               node->prev = NULL;
+               fd_start = node;
+               return node;
+       }
+
+       /* walk to the current tail and hook the node in behind it */
+       for (tail = fd_start; tail->next != NULL; tail = tail->next)
+               ;
+       tail->next = node;
+       node->prev = tail;
+
+       return node;
+}
+
+/*
+ * Look up the disk state registered under 'cookie'.  Returns the state of
+ * the first matching list entry, or NULL when no entry carries the cookie.
+ */
+static inline struct td_state *get_state(int cookie)
+{
+       fd_list_entry_t *entry;
+
+       for (entry = fd_start; entry != NULL; entry = entry->next)
+               if (entry->cookie == cookie)
+                       return entry->s;
+
+       return NULL;
+}
+
+/*
+ * Map the driver type number sent by blktapctrl to its tap_disk driver.
+ * The number arrives in a control message, so it is range-checked against
+ * the dtypes[] table before being used as an index.  Returns NULL for an
+ * unknown type; the caller already treats NULL as a failure.
+ */
+static struct tap_disk *get_driver(int drivertype)
+{
+       int ntypes = sizeof(dtypes) / sizeof(dtypes[0]);
+
+       if (drivertype < 0 || drivertype >= ntypes) {
+               DPRINTF("Unknown driver type [%d]\n", drivertype);
+               return NULL;
+       }
+
+       return dtypes[drivertype]->drv;
+}
+
+/*
+ * Allocate the per-disk state together with its blkif and ring info.
+ * All three are zero-filled and the ring info starts with fd = -1, so
+ * tearing down a disk that was never mapped does not unmap or close
+ * anything by accident.  Returns NULL, with nothing left allocated, when
+ * any allocation fails.
+ */
+static struct td_state *state_init(void)
+{
+       int i;
+       struct td_state *s;
+       blkif_t *blkif;
+       tapdev_info_t *info;
+
+       s = calloc(1, sizeof(struct td_state));
+       blkif = calloc(1, sizeof(blkif_t));
+       info = calloc(1, sizeof(tapdev_info_t));
+       if (s == NULL || blkif == NULL || info == NULL) {
+               free(info);
+               free(blkif);
+               free(s);
+               return NULL;
+       }
+
+       info->fd = -1; /* no tap device opened yet */
+       s->blkif = blkif;
+       s->ring_info = info;
+
+       for (i = 0; i < MAX_REQUESTS; i++)
+               blkif->pending_list[i].count = 0;
+
+       return s;
+}
+
+/*
+ * Open tap device number 'minor', map its shared memory region, set up the
+ * back ring on it, tell the device our pid and mode, and record the tap fd
+ * in the list entry that belongs to 's'.
+ *
+ * Returns 'minor' on success and -1 on failure.  When the mmap fails the
+ * freshly opened descriptor is closed again and the ring info is reset
+ * (fd = -1, mem = NULL) so that a later teardown neither closes the
+ * descriptor twice nor unmaps MAP_FAILED.
+ */
+static int map_new_dev(struct td_state *s, int minor)
+{
+       int tap_fd;
+       tapdev_info_t *info = s->ring_info;
+       char *devname;
+       fd_list_entry_t *ptr;
+
+       /* on failure asprintf leaves devname undefined: do not free it */
+       if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME,
+                    minor) == -1) {
+               DPRINTF("asprintf failed for dev minor %d\n", minor);
+               return -1;
+       }
+
+       tap_fd = open(devname, O_RDWR);
+       if (tap_fd == -1) 
+       {
+               DPRINTF("open failed on dev %s!",devname);
+               goto fail;
+       } 
+       info->fd = tap_fd;
+
+       /*Map the shared memory*/
+       info->mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE, 
+                         PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+       if (info->mem == MAP_FAILED) 
+       {
+               DPRINTF("mmap failed on dev %s!\n",devname);
+               info->mem = NULL;
+               info->fd = -1;
+               close(tap_fd);
+               goto fail;
+       }
+
+       /* assign the rings to the mapped memory */ 
+       info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+       BACK_RING_INIT(&info->fe_ring, info->sring, PAGE_SIZE);
+       
+       /* data pages start after the ring pages of the mapping */
+       info->vstart = 
+               (unsigned long)info->mem + (BLKTAP_RING_PAGES << PAGE_SHIFT);
+
+       /* NOTE(review): the results of both ioctls are not checked */
+       ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+       ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+       free(devname);
+
+       /*Update the fd entry*/
+       ptr = fd_start;
+       while (ptr != NULL) {
+               if (s == ptr->s) {
+                       ptr->tap_fd = tap_fd;
+                       break;
+               }
+               ptr = ptr->next;
+       }       
+
+       return minor;
+
+ fail:
+       free(devname);
+       return -1;
+}
+
+/*
+ * Read one control message from blktapctrl into 'buf' (MSG_SIZE bytes) and
+ * act on it:
+ *
+ *   CTLMSG_PARAMS - pick the driver, allocate the disk state, open the
+ *                   image and register it under the message cookie;
+ *                   answers CTLMSG_IMG (with size information) or
+ *                   CTLMSG_IMG_FAIL.
+ *   CTLMSG_NEWDEV - map the tap device for a registered cookie; answers
+ *                   CTLMSG_NEWDEV_RSP or CTLMSG_NEWDEV_FAIL.
+ *   CTLMSG_CLOSE  - tear the disk down and re-evaluate whether to exit.
+ *   CTLMSG_PID    - answers CTLMSG_PID_RSP carrying our pid.
+ *
+ * Returns 1 when a known message was handled and 0 otherwise (short read,
+ * read error or unknown type).
+ *
+ * Compared with the first version: the image path is always
+ * NUL-terminated before it is logged or opened, every allocation of a
+ * half-built state is released on failure, a disk whose td_open() failed
+ * is no longer registered, and CTLMSG_CLOSE frees the driver private area.
+ */
+static int read_msg(char *buf)
+{
+       int length, len, msglen, *io_fd;
+       char *ptr, *path;
+       image_t *img;
+       msg_hdr_t *msg;
+       msg_newdev_t *msg_dev;
+       msg_pid_t *msg_pid;
+       struct tap_disk *drv;
+       int ret = -1;
+       struct td_state *s = NULL;
+       fd_list_entry_t *entry;
+       void *priv;
+
+       length = read(fds[READ], buf, MSG_SIZE);
+
+       if (length > 0 && length >= sizeof(msg_hdr_t)) 
+       {
+               msg = (msg_hdr_t *)buf;
+               DPRINTF("Tapdisk: Received msg, len %d, type %d, UID %d\n",
+                       length,msg->type,msg->cookie);
+
+               switch (msg->type) {
+               case CTLMSG_PARAMS:                     
+                       ptr = buf + sizeof(msg_hdr_t);
+                       len = (length - sizeof(msg_hdr_t));
+
+                       /* one spare zero byte keeps path NUL-terminated */
+                       path = calloc(1, len + 1);
+                       if (path == NULL)
+                               goto params_done;
+                       
+                       memcpy(path, ptr, len); 
+                       DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+                       /*Assign driver*/
+                       drv = get_driver(msg->drivertype);
+                       if (drv == NULL)
+                               goto params_done;
+                               
+                       DPRINTF("Loaded driver: name [%s], type [%d]\n",
+                               drv->disk_type, msg->drivertype);
+
+                       /* Allocate the disk structs */
+                       s = state_init();
+                       if (s == NULL)
+                               goto params_done;
+
+                       s->drv = drv;
+                       s->private = malloc(drv->private_data_size);
+                       if (s->private == NULL) {
+                               free(s->blkif);
+                               free(s->ring_info);
+                               free(s);
+                               goto params_done;
+                       }
+
+                       /*Open file*/
+                       ret = drv->td_open(s, path);
+                       if (ret != 0) {
+                               /*
+                                * The image could not be opened: drop the
+                                * state instead of registering a disk whose
+                                * descriptors are not valid.  The buffer is
+                                * cleared as on the success path so that the
+                                * failure reply keeps the same layout.
+                                */
+                               free(s->private);
+                               free(s->blkif);
+                               free(s->ring_info);
+                               free(s);
+                               memset(buf, 0x00, MSG_SIZE);
+                               goto params_done;
+                       }
+                       io_fd = drv->td_get_fd(s);
+
+                       entry = add_fd_entry(0, io_fd, s);
+                       entry->cookie = msg->cookie;
+                       DPRINTF("Entered cookie %d\n",entry->cookie);
+                       
+                       memset(buf, 0x00, MSG_SIZE); 
+                       
+               params_done:
+                       if (ret == 0) {
+                               msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+                               msg->type = CTLMSG_IMG;
+                               img = (image_t *)(buf + sizeof(msg_hdr_t));
+                               img->size = s->size;
+                               img->secsize = s->sector_size;
+                               img->info = s->info;
+                       } else {
+                               msglen = sizeof(msg_hdr_t);
+                               msg->type = CTLMSG_IMG_FAIL;
+                               msg->len = msglen;
+                       }
+                       len = write(fds[WRITE], buf, msglen);
+                       free(path);
+                       return 1;
+                       
+                       
+                       
+               case CTLMSG_NEWDEV:
+                       msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+                       s = get_state(msg->cookie);
+                       DPRINTF("Retrieving state, cookie %d.....[%s]\n",msg->cookie, (s == NULL ? "FAIL":"OK"));
+                       if (s != NULL) {
+                               ret = ((map_new_dev(s, msg_dev->devnum) 
+                                       == msg_dev->devnum ? 0: -1));
+                               /*
+                                * NOTE(review): counted even when the
+                                * mapping failed -- confirm that blktapctrl
+                                * follows a NEWDEV failure with a CLOSE.
+                                */
+                               connected_disks++;
+                       }       
+
+                       memset(buf, 0x00, MSG_SIZE); 
+                       msglen = sizeof(msg_hdr_t);
+                       msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+                                             : CTLMSG_NEWDEV_FAIL);
+                       msg->len = msglen;
+
+                       len = write(fds[WRITE], buf, msglen);
+                       return 1;
+
+               case CTLMSG_CLOSE:
+                       s = get_state(msg->cookie);
+                       if (s) {
+                               /*
+                                * unmap_disk() frees the state but not the
+                                * driver private area, and the driver's
+                                * td_close() still runs inside it, so the
+                                * private area is freed afterwards.
+                                */
+                               priv = s->private;
+                               unmap_disk(s);
+                               free(priv);
+                       }
+                       
+                       connected_disks--;
+                       sig_handler(SIGINT);
+
+                       return 1;                       
+
+               case CTLMSG_PID:
+                       memset(buf, 0x00, MSG_SIZE);
+                       msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+                       msg->type = CTLMSG_PID_RSP;
+                       msg->len = msglen;
+
+                       msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+                       process = getpid();
+                       msg_pid->pid = process;
+
+                       len = write(fds[WRITE], buf, msglen);
+                       return 1;
+
+               default:
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Copy one response into the ring slot addressed by the private response
+ * producer index of disk 's' and advance that index.  Nothing is pushed to
+ * the shared ring here; kick_responses() does that with
+ * RING_PUSH_RESPONSES().  Always returns 0.
+ */
+static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+       tapdev_info_t *info = s->ring_info;
+       blkif_response_t *rsp_d;
+       
+       rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+       memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+       /* barrier between filling the slot and advancing the index */
+       wmb();
+       info->fe_ring.rsp_prod_pvt++;
+       
+       return 0;
+}
+
+/*
+ * Publish queued responses: when the private producer index differs from
+ * the shared one, push the responses onto the shared ring and kick the
+ * tap device with BLKTAP_IOCTL_KICK_FE.  Does nothing otherwise.
+ */
+static inline void kick_responses(struct td_state *s)
+{
+       tapdev_info_t *ring = s->ring_info;
+
+       /* nothing new has been written since the last push */
+       if (ring->fe_ring.rsp_prod_pvt == ring->fe_ring.sring->rsp_prod)
+               return;
+
+       RING_PUSH_RESPONSES(&ring->fe_ring);
+       ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE);
+}
+
+/*
+ * Let the driver of disk 's' run its completion callbacks for source
+ * 'sid'; when it reports a positive count, push the queued responses to
+ * the ring.  Does nothing once shutdown has been requested.
+ */
+void io_done(struct td_state *s, int sid)
+{
+       struct tap_disk *driver;
+
+       /*We have received signal to close*/
+       if (!run)
+               return;
+
+       driver = s->drv;
+       if (driver->td_do_callbacks(s, sid) <= 0)
+               return;
+
+       kick_responses(s);
+}
+
+/*
+ * Completion callback handed to the drivers: one call per finished
+ * segment of the request stored in pending slot 'idx'.
+ *
+ *   s       - disk the request belongs to
+ *   res     - 0 on success, anything else marks the request as failed
+ *   idx     - index into blkif->pending_list
+ *   private - unused
+ *
+ * Every call, failed or not, consumes one outstanding segment.  When the
+ * last segment completes, a response carrying the request's id and
+ * operation and the accumulated status is written to the ring.
+ *
+ * Returns the number of responses queued by this call (0 or 1).
+ *
+ * A failed segment used to return early without being counted, which left
+ * the request pending for ever; it now records BLKIF_RSP_ERROR and is
+ * accounted for like any other segment.  Negative indices are rejected,
+ * and the response is built in a local variable instead of on top of the
+ * stored request.
+ */
+int send_responses(struct td_state *s, int res, int idx, void *private)
+{
+       blkif_t *blkif = s->blkif;
+       blkif_request_t *req;
+       blkif_response_t rsp;
+
+       if ( (idx < 0) || (idx > MAX_REQUESTS-1) || 
+           (blkif->pending_list[idx].count == 0) )
+       {
+               DPRINTF("invalid index returned(%u)!\n", idx);
+               return 0;
+       }
+       
+       if (res != 0) {
+               DPRINTF("*** request error %d! \n", res);
+               blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+       }
+
+       /* still waiting for other segments of this request */
+       if (--blkif->pending_list[idx].count != 0)
+               return 0;
+
+       req = &blkif->pending_list[idx].req;
+
+       memset(&rsp, 0, sizeof(rsp));
+       rsp.id = req->id;
+       rsp.operation = req->operation;
+       rsp.status = blkif->pending_list[idx].status;
+
+       write_rsp_to_ring(s, &rsp);
+
+       return 1;
+}
+
+/*
+ * Drain every request currently published on the ring of disk 's' and
+ * queue one driver read or write per valid segment, with send_responses()
+ * as the completion callback.  Once the batch has been handed to
+ * td_submit(), io_done() is called if any queue call reported early
+ * completions (a return value greater than zero).
+ *
+ * 'ret' is now a signed int: it used to share the unsigned RING_IDX type,
+ * which made negative driver results pass the "ret > 0" test and kept the
+ * -EBUSY branch from ever being taken.  Request ids are also checked
+ * against MAX_REQUESTS before they are used as a pending_list index.
+ *
+ * NOTE(review): segments that are skipped below (bad sector range, start
+ * sector beyond the disk, unknown operation) never reach send_responses(),
+ * so the pending count of their request does not drop to zero here --
+ * confirm how such requests are meant to be completed.
+ */
+static void get_io_request(struct td_state *s)
+{
+       RING_IDX          rp, j, i;
+       blkif_request_t  *req;
+       int idx, nsects;
+       int ret;
+       uint64_t sector_nr;
+       char *page;
+       int early = 0; /* count early completions */
+       struct tap_disk *drv = s->drv;
+       blkif_t *blkif = s->blkif;
+       tapdev_info_t *info = s->ring_info;
+
+       if (!run) return; /*We have received signal to close*/
+
+       rp = info->fe_ring.sring->req_prod; 
+       rmb();
+       for (j = info->fe_ring.req_cons; j != rp; j++)
+       {
+               req = RING_GET_REQUEST(&info->fe_ring, j);
+               ++info->fe_ring.req_cons;
+               
+               if (req == NULL) continue;
+
+               /* the id doubles as the pending_list index */
+               if (req->id >= MAX_REQUESTS) {
+                       DPRINTF("invalid request id (%llu)!\n",
+                               (long long unsigned) req->id);
+                       continue;
+               }
+               
+               idx = req->id;
+               ASSERT(blkif->pending_list[idx].count == 0);
+               memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
+               blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+               blkif->pending_list[idx].count = req->nr_segments;
+
+               sector_nr = req->sector_number;
+
+               for (i = 0; i < req->nr_segments; i++) {
+                       nsects = req->seg[i].last_sect - 
+                                req->seg[i].first_sect + 1;
+       
+                       if ((req->seg[i].last_sect >= PAGE_SIZE >> 9) ||
+                           (nsects <= 0))
+                               continue;
+
+                       page  = (char *)MMAP_VADDR(info->vstart, 
+                                                  (unsigned long)req->id, i);
+                       page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+                       if (sector_nr >= s->size) {
+                               DPRINTF("Sector request failed:\n");
+                               DPRINTF("%s request, idx [%d,%d] size [%llu], "
+                                       "sector [%llu,%llu]\n",
+                                       (req->operation == BLKIF_OP_WRITE ? 
+                                        "WRITE" : "READ"),
+                                       idx,i,
+                                       (long long unsigned) 
+                                               nsects<<SECTOR_SHIFT,
+                                       (long long unsigned) 
+                                               sector_nr<<SECTOR_SHIFT,
+                                       (long long unsigned) sector_nr);
+                               continue;
+                       }
+                       
+                       switch (req->operation) 
+                       {
+                       case BLKIF_OP_WRITE:
+                               ret = drv->td_queue_write(s, sector_nr,
+                                               nsects, page, send_responses, 
+                                               idx, NULL);
+                               if (ret > 0) early += ret;
+                               else if (ret == -EBUSY) {
+                                       /*
+                                        * TODO: Sector is locked         *
+                                        * Need to put req back on queue  *
+                                        */
+                               }
+                               break;
+                       case BLKIF_OP_READ:
+                               ret = drv->td_queue_read(s, sector_nr,
+                                               nsects, page, send_responses, 
+                                               idx, NULL);
+                               if (ret > 0) early += ret;
+                               else if (ret == -EBUSY) {
+                                       /*
+                                        * TODO: Sector is locked         *
+                                        * Need to put req back on queue  *
+                                        */
+                               }
+                               break;
+                       default:
+                               DPRINTF("Unknown block operation\n");
+                               break;
+                       }
+                       sector_nr += nsects;
+               }
+       }
+
+       /*Batch done*/
+       drv->td_submit(s);
+       
+       if (early > 0) 
+               io_done(s,10);
+               
+       return;
+}
+
+/*
+ * tapdisk entry point.  argv[1] and argv[2] name the fifos used to read
+ * control messages from, and write replies to, blktapctrl.
+ *
+ * After daemonizing, the process loops on select() over the control fifo
+ * and the descriptors of every registered disk: tap fds feed
+ * get_io_request(), driver io fds feed io_done(), and the control fifo
+ * feeds read_msg().  The loop ends when 'run' is cleared.
+ *
+ * On exit every disk still in the list is torn down exactly once.
+ * unmap_disk() already closes the driver and the tap device and frees the
+ * state and its list entry, so the loop reads the next pointer and the
+ * driver private pointer before calling it and touches neither the state
+ * nor the entry afterwards.  (The first version repeated the teardown on
+ * the freed objects.)
+ */
+int main(int argc, char *argv[])
+{
+       int ret, i;
+       char *buf;
+       fd_set readfds;
+       struct timeval timeout;
+       fd_list_entry_t *ptr, *next;
+       struct td_state *s;
+       void *priv;
+       
+       if (argc != 3) usage();
+
+       daemonize();
+
+       openlog("TAPDISK", LOG_CONS|LOG_ODELAY, LOG_DAEMON);
+       /*Setup signal handlers*/
+       signal (SIGBUS, sig_handler);
+       signal (SIGINT, sig_handler);
+
+       /*Open the control channel*/
+       fds[READ] = open(argv[1],O_RDWR|O_NONBLOCK);
+       fds[WRITE] = open(argv[2],O_RDWR|O_NONBLOCK);
+
+       if ( (fds[READ] < 0) || (fds[WRITE] < 0) ) 
+       {
+               DPRINTF("FD open failed [%d,%d]\n",fds[READ], fds[WRITE]);
+               exit(-1);
+       }
+
+       buf = calloc(MSG_SIZE, 1);
+
+       if (buf == NULL) 
+       {
+               DPRINTF("ERROR: allocating memory.\n");
+               exit(-1);
+       }
+
+       while (run) 
+       {
+               ret = 0;
+               FD_ZERO(&readfds);
+               FD_SET(fds[READ], &readfds);
+               maxfds = fds[READ];
+
+               /*Set all tap fds*/
+               LOCAL_FD_SET(&readfds);
+
+               timeout.tv_sec = 0; 
+               timeout.tv_usec = 1000; 
+
+               /*Wait for incoming messages*/
+               ret = select(maxfds + 1, &readfds, (fd_set *) 0, 
+                            (fd_set *) 0, &timeout);
+
+               if (ret > 0) 
+               {
+                       ptr = fd_start;
+                       while (ptr != NULL) {
+                               if (FD_ISSET(ptr->tap_fd, &readfds)) 
+                                       get_io_request(ptr->s);
+                               for (i = 0; i < MAX_IOFD; i++) {
+                                       if (ptr->io_fd[i] && 
+                                          FD_ISSET(ptr->io_fd[i], &readfds)) 
+                                               io_done(ptr->s, i);
+                               }
+
+                               ptr = ptr->next;
+                       }
+
+                       /*
+                        * Control messages are handled after the list
+                        * walk: a CTLMSG_CLOSE frees list entries.
+                        */
+                       if (FD_ISSET(fds[READ], &readfds))
+                               read_msg(buf);
+               }
+       }
+       free(buf);
+       close(fds[READ]);
+       close(fds[WRITE]);
+
+       ptr = fd_start;
+       while (ptr != NULL) {
+               /* both are read now: unmap_disk() frees ptr and s */
+               next = ptr->next;
+               s = ptr->s;
+               priv = s->private;
+
+               unmap_disk(s);
+               /* the driver's td_close() has run, private is unused now */
+               free(priv);
+
+               ptr = next;
+       }
+       closelog();
+
+       return 0;
+}
diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h
new file mode 100644 (file)
index 0000000..1f03156
--- /dev/null
@@ -0,0 +1,211 @@
+/* tapdisk.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ * 
+ * Some notes on the tap_disk interface:
+ * 
+ * tap_disk aims to provide a generic interface to easily implement new 
+ * types of image accessors.  The structure-of-function-calls is similar
+ * to disk interfaces used in qemu/denali/etc, with the significant 
+ * difference being the expectation of asynchronous rather than synchronous 
+ * I/O.  The asynchronous interface is intended to allow lots of requests to
+ * be pipelined through a disk, without the disk requiring any of its own
+ * threads of control.  As such, a batch of requests is delivered to the disk
+ * using:
+ * 
+ *    td_queue_[read,write]()
+ * 
+ * and passing in a completion callback, which the disk is responsible for 
+ * tracking.  The end of a batch is marked with a call to:
+ * 
+ *    td_submit()
+ * 
+ * The disk implementation must provide a file handle, which is used to 
+ * indicate that it needs to do work.  tapdisk will add this file handle 
+ * (returned from td_get_fd()) to its poll set, and will call into the disk
+ * using td_do_callbacks() whenever there is data pending.
+ * 
+ * Two disk implementations demonstrate how this interface may be used to 
+ * implement disks with both asynchronous and synchronous calls.  block-aio.c
+ * maps this interface down onto the linux libaio calls, while block-sync uses 
+ * normal posix read/write.
+ * 
+ * A few things to realize about the sync case, which doesn't need to defer 
+ * io completions:
+ * 
+ *   - td_queue_[read,write]() call read/write directly, and then call the 
+ *     callback immediately.  They MUST then return a value greater than 0
+ *     in order to tell tapdisk that requests have finished early, and to 
+ *     force responses to be kicked to the clients.
+ * 
+ *   - The fd used for poll is an otherwise unused pipe, which allows poll to 
+ *     be safely called without ever returning anything.
+ * 
+ */
+
+#ifndef TAPDISK_H_
+#define TAPDISK_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include "blktaplib.h"
+
+/*If enabled, log all debug messages to syslog*/
+#if 1
+#define DPRINTF(_f, _a...) syslog( LOG_DEBUG, _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Things disks need to know about, these should probably be in a higher-level
+ * header. */
+#define MAX_REQUESTS            64
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+/* This structure represents the state of an active virtual disk.           */
+/* The pointer members are untyped here; the notes below give the types     */
+/* that tapdisk.c stores in them.                                           */
+struct td_state {
+       void *private;   /* driver data, private_data_size bytes (tapdisk.c) */
+       void *drv;       /* struct tap_disk * serving this disk */
+       void *blkif;     /* blkif_t * holding the pending request list */
+       void *image;     /* not referenced by tapdisk.c */
+       void *ring_info; /* tapdev_info_t *: tap device fd, mapping, ring */
+       void *fd_entry;  /* fd_list_entry_t * of this disk in the fd list */
+       char backing_file[1024]; /*Used by differencing disks, e.g. qcow*/
+       long int   sector_size; /* reported as 'secsize' in the CTLMSG_IMG reply */
+       uint64_t   size;        /* compared against request sector numbers */
+       long int   info;        /* reported as 'info' in the CTLMSG_IMG reply */
+};
+
+/* Prototype of the callback to activate as requests complete.              */
+/* 'res' is zero on success, 'id' and 'prv' are the values that were given  */
+/* to td_queue_read()/td_queue_write().                                     */
+typedef int (*td_callback_t)(struct td_state *s, int res, int id, void *prv);
+
+/* Structure describing the interface to a virtual disk implementation.     */
+/* See note at the top of this file describing this interface.              */
+struct tap_disk {
+       const char *disk_type;      /* driver name, used in log messages */
+       int private_data_size;      /* bytes tapdisk allocates for s->private */
+       /* Open the image called 'name'; 0 is treated as success. */
+       int (*td_open)        (struct td_state *s, const char *name);
+       /* Queue a read of nb_sectors sectors starting at 'sector' into buf. */
+       /* A result greater than 0 counts as early completions. */
+       int (*td_queue_read)  (struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *prv);
+       /* Queue a write; same conventions as td_queue_read. */
+       int (*td_queue_write) (struct td_state *s, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *prv);
+       /* Marks the end of a batch of queued requests. */
+       int (*td_submit)      (struct td_state *s);
+       /* Returns the MAX_IOFD descriptors tapdisk should poll for the disk. */
+       int *(*td_get_fd)      (struct td_state *s);
+       /* Close the disk. */
+       int (*td_close)       (struct td_state *s);
+       /* Run completion callbacks for source 'sid'; a result greater than 0 */
+       /* makes tapdisk push the queued responses. */
+       int (*td_do_callbacks)(struct td_state *s, int sid);
+};
+
+/* Static description of one supported disk type.  The driver pointer only  */
+/* exists when TAPDISK is defined, as tapdisk.c does before including this  */
+/* header.                                                                  */
+typedef struct disk_info {
+       int  idnum;          /* one of the DISK_TYPE_* numbers */
+       char name[50];       /* e.g. "RAMDISK" */
+       char handle[10];     /* xend handle, e.g. 'ram' */
+       int  single_handler; /* is there a single controller for all */
+                            /* instances of disk type? */
+#ifdef TAPDISK
+       struct tap_disk *drv;   
+#endif
+} disk_info_t;
+
+void debug_fe_ring(struct td_state *s);
+
+extern struct tap_disk tapdisk_aio;
+extern struct tap_disk tapdisk_sync;
+extern struct tap_disk tapdisk_vmdk;
+extern struct tap_disk tapdisk_ram;
+extern struct tap_disk tapdisk_qcow;
+
+#define MAX_DISK_TYPES  20
+#define MAX_IOFD        2
+
+#define DISK_TYPE_AIO   0
+#define DISK_TYPE_SYNC  1
+#define DISK_TYPE_VMDK  2
+#define DISK_TYPE_RAM   3
+#define DISK_TYPE_QCOW  4
+
+
+/*Define Individual Disk Parameters here */
+/* Each initialiser lists idnum, name, handle and single_handler, followed  */
+/* by the driver pointer when TAPDISK is defined.                           */
+static disk_info_t aio_disk = {
+       DISK_TYPE_AIO,
+       "raw image (aio)",
+       "aio",
+       0,
+#ifdef TAPDISK
+       &tapdisk_aio,
+#endif
+};
+
+static disk_info_t sync_disk = {
+       DISK_TYPE_SYNC,
+       "raw image (sync)",
+       "sync",
+       0,
+#ifdef TAPDISK
+       &tapdisk_sync,
+#endif
+};
+
+static disk_info_t vmdk_disk = {
+       DISK_TYPE_VMDK,
+       "vmware image (vmdk)",
+       "vmdk",
+       1,
+#ifdef TAPDISK
+       &tapdisk_vmdk,
+#endif
+};
+
+static disk_info_t ram_disk = {
+       DISK_TYPE_RAM,
+       "ramdisk image (ram)",
+       "ram",
+       1,
+#ifdef TAPDISK
+       &tapdisk_ram,
+#endif
+};
+
+static disk_info_t qcow_disk = {
+       DISK_TYPE_QCOW,
+       "qcow disk (qcow)",
+       "qcow",
+       0,
+#ifdef TAPDISK
+       &tapdisk_qcow,
+#endif
+};
+
+/*Main disk info array */
+/* tapdisk.c indexes this array with the driver type number it receives, so */
+/* the order of the entries must follow the DISK_TYPE_* values (0..4).      */
+static disk_info_t *dtypes[] = {
+       &aio_disk,
+       &sync_disk,
+       &vmdk_disk,
+       &ram_disk,
+       &qcow_disk,
+};
+
+/* Node of a doubly linked list of blkifs (not referenced by tapdisk.c). */
+typedef struct driver_list_entry {
+       void *blkif;
+       void *prev;
+       void *next;
+} driver_list_entry_t;
+
+/* Node of tapdisk's doubly linked list of registered disks; it carries the */
+/* descriptors that the select() loop watches for one disk.                 */
+typedef struct fd_list_entry {
+       int cookie;           /* id from the control message; lookup key */
+       int  tap_fd;          /* tap device fd, 0 until the device is mapped */
+       int  io_fd[MAX_IOFD]; /* driver descriptors from td_get_fd() */
+       struct td_state *s;   /* disk this entry belongs to */
+       void *prev;           /* previous fd_list_entry_t, NULL at the head */
+       void *next;           /* next fd_list_entry_t, NULL at the tail */
+} fd_list_entry_t;
+
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int flags);
+
+#endif /*TAPDISK_H_*/
diff --git a/tools/blktap/lib/Makefile b/tools/blktap/lib/Makefile
new file mode 100644 (file)
index 0000000..c0eb28b
--- /dev/null
@@ -0,0 +1,66 @@
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAJOR    = 3.0
+MINOR    = 0
+SONAME   = libblktap.so.$(MAJOR)
+
+BLKTAP_INSTALL_DIR = /usr/sbin
+
+INSTALL            = install
+INSTALL_PROG       = $(INSTALL) -m0755
+INSTALL_DIR        = $(INSTALL) -d -m0755
+
+INCLUDES += -I. -I.. -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+
+LIBS     := -lz
+
+SRCS     :=
+SRCS     += xenbus.c blkif.c xs_api.c
+
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -fno-strict-aliasing -fPIC
+CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# get asprintf():
+CFLAGS   += -D _GNU_SOURCE
+
+# Get gcc to generate the dependencies for us.
+CFLAGS   += -Wp,-MD,.$(@F).d
+CFLAGS   += $(INCLUDES) 
+DEPS     = .*.d
+
+OBJS     = $(patsubst %.c,%.o,$(SRCS))
+IBINS   :=
+
+LIB      = libblktap.a libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
+
+# Default goal: build the library.
+all: build
+
+# Re-invoke make on the libblktap target, which links the library.
+build:
+       $(MAKE) libblktap
+
+# Install the library files and blktaplib.h below $(DESTDIR)/usr.
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
+       $(INSTALL_DIR) -p $(DESTDIR)/usr/include
+       $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
+       $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
+
+# Remove build products, editor backups, dependency files and TAGS.
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS
+
+# Link the shared library from the objects, create the two version
+# symlinks, and build the static archive.  The archive is made from the
+# object files themselves ($^); it used to be made from the shared object,
+# which gives a static library nothing can link against.
+libblktap: $(OBJS) 
+       $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared         \
+             -L$(XEN_XENSTORE) -l xenstore                       \
+             -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) $@.so
+       $(AR) rc libblktap.a $^
+
+.PHONY: TAGS all build clean install libblktap
+
+TAGS:
+       etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff --git a/tools/blktap/lib/blkif.c b/tools/blktap/lib/blkif.c
new file mode 100644 (file)
index 0000000..9a19596
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * tools/blktap/lib/blkif.c
+ * 
+ * The blkif interface for blktap.  A blkif describes an in-use virtual disk.
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <err.h>
+#include <unistd.h>
+
+#include "blktaplib.h"
+
+/*
+ * Debug tracing.  With "#if 0" (as committed) DPRINTF() expands to a no-op
+ * expression; change it to "#if 1" to route the messages to printf().
+ */
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Table of known blkifs, chained through blkif_t.hash_next.
+ * BLKIF_HASH() XORs domid and handle and masks the result with
+ * BLKIF_HASHSZ-1, so BLKIF_HASHSZ has to stay a power of two.
+ */
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+/*
+ * Find the blkif registered for (domid, handle).
+ *
+ * Scans the collision chain of the matching hash bucket and returns the
+ * first entry whose domid and handle both match, or NULL when none does.
+ */
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+       blkif_t *cur;
+
+       for (cur = blkif_hash[BLKIF_HASH(domid, handle)];
+            cur != NULL;
+            cur = cur->hash_next) {
+               if ((cur->domid == domid) && (cur->handle == handle))
+                       return cur;
+       }
+       return NULL;
+}
+
+/*
+ * Allocate a blkif for the given domain.
+ *
+ * The structure comes back zero-filled except for domid (set to the
+ * argument) and devnum (set to -1, i.e. no device number assigned yet).
+ * Returns NULL when memory cannot be allocated.
+ */
+blkif_t *alloc_blkif(domid_t domid)
+{
+       blkif_t *fresh;
+
+       DPRINTF("Alloc_blkif called [%d]\n",domid);
+
+       /* calloc() gives us the zeroed structure in one step. */
+       fresh = calloc(1, sizeof(*fresh));
+       if (fresh == NULL)
+               return NULL;
+
+       fresh->domid  = domid;
+       fresh->devnum = -1;
+       return fresh;
+}
+
+/*Controller callbacks*/
+/*
+ * Each hook is a single process-wide function pointer; registering again
+ * simply replaces the previous value.  All three start out NULL.
+ */
+
+/* Called from blkif_init(); its return value is stored in blkif->devnum,
+ * and -1 is treated as failure. */
+static int (*new_devmap_hook)(blkif_t *blkif) = NULL;
+void register_new_devmap_hook(int (*fn)(blkif_t *blkif))
+{
+       new_devmap_hook = fn;
+}
+
+/* Called from free_blkif() when a blkif is torn down; the return value is
+ * ignored there. */
+static int (*new_unmap_hook)(blkif_t *blkif) = NULL;
+void register_new_unmap_hook(int (*fn)(blkif_t *blkif))
+{
+       new_unmap_hook = fn;
+}
+
+/* Called from blkif_init() for every new blkif; a non-zero return aborts
+ * the initialisation. */
+static int (*new_blkif_hook)(blkif_t *blkif) = NULL;
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif))
+{
+       new_blkif_hook = fn;
+}
+
+/*
+ * Complete the setup of a blkif obtained from alloc_blkif() and publish it.
+ *
+ * Stores handle/pdev/readonly, runs the registered new_blkif hook (the tap
+ * application should define it, and it should return having set
+ * blkif->ops), appends the blkif to its (domid, handle) hash chain and
+ * finally asks the devmap hook for a device number, kept in blkif->devnum.
+ *
+ * Returns 0 on success, -EINVAL if blkif is NULL, and -1 when a hook is
+ * missing or fails, or when (domid, handle) is already registered.
+ * On any failure the blkif is NOT left in the hash table.
+ */
+int blkif_init(blkif_t *blkif, long int handle, long int pdev,
+               long int readonly)
+{
+       domid_t domid;
+       blkif_t **pblkif;
+       int devnum;
+
+       if (blkif == NULL)
+               return -EINVAL;
+
+       domid = blkif->domid;
+       blkif->handle   = handle;
+       blkif->pdev     = pdev;
+       blkif->readonly = readonly;
+
+       /*
+        * Both hooks are mandatory.  Check them up front so that a missing
+        * devmap hook is noticed before the image has been opened and
+        * before the blkif has been published in the hash table.
+        */
+       if (new_blkif_hook == NULL)
+       {
+               DPRINTF("Probe detected a new blkif, but no new_blkif_hook!");
+               return -1;
+       }
+       if (new_devmap_hook == NULL)
+       {
+               DPRINTF("Probe setting up new blkif but no devmap hook!");
+               return -1;
+       }
+
+       if (new_blkif_hook(blkif)!=0) {
+               DPRINTF("BLKIF: Image open failed\n");
+               return -1;
+       }
+
+       /* Now wire it in: walk to the tail, refusing duplicates on the way. */
+       pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+       DPRINTF("Created hash entry: %d [%d,%ld]\n",
+               BLKIF_HASH(domid, handle), domid, handle);
+
+       while ( *pblkif != NULL )
+       {
+               if ( ((*pblkif)->domid == domid) &&
+                    ((*pblkif)->handle == handle) )
+               {
+                       DPRINTF("Could not create blkif: already exists\n");
+                       return -1;
+               }
+               pblkif = &(*pblkif)->hash_next;
+       }
+       blkif->hash_next = NULL;
+       *pblkif = blkif;
+
+       devnum = new_devmap_hook(blkif);
+       if (devnum == -1) {
+               /*
+                * pblkif still addresses the tail slot we just filled;
+                * clear it so lookups cannot return a half-initialised blkif.
+                */
+               *pblkif = NULL;
+               return -1;
+       }
+       blkif->devnum = devnum;
+
+       return 0;
+}
+
+/*
+ * Tear down a blkif: unhook it from the hash table, notify the controller
+ * through the unmap hook (if one is registered) and release the blkif
+ * together with its prv and info allocations.
+ *
+ * Passing NULL is a no-op.  A blkif that was never wired into the hash
+ * table is handled too (the chain walk simply finds nothing).
+ */
+void free_blkif(blkif_t *blkif)
+{
+       blkif_t **pblkif;
+
+       /* Must be checked before blkif->domid/handle are read below. */
+       if (blkif == NULL)
+               return;
+
+       for (pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)];
+            *pblkif != NULL;
+            pblkif = &(*pblkif)->hash_next)
+       {
+               if ( *pblkif == blkif )
+               {
+                       *pblkif = blkif->hash_next;
+                       break;
+               }
+       }
+
+       /*
+        * Run the hook while prv/info are still valid, so it is never handed
+        * dangling pointers.
+        */
+       if (new_unmap_hook != NULL)
+               new_unmap_hook(blkif);
+
+       /* free(NULL) is a no-op, so no guards are needed. */
+       free(blkif->prv);
+       free(blkif->info);
+       free(blkif);
+}
+
+/* Reset the blkif hash table so that every bucket is empty. */
+void __init_blkif(void)
+{
+       unsigned int bucket;
+
+       for (bucket = 0; bucket < BLKIF_HASHSZ; bucket++)
+               blkif_hash[bucket] = NULL;
+}
diff --git a/tools/blktap/lib/blktaplib.h b/tools/blktap/lib/blktaplib.h
new file mode 100644 (file)
index 0000000..ceab6b7
--- /dev/null
@@ -0,0 +1,223 @@
+/* blktaplib.h
+ *
+ * Blktap library userspace code.
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <xenctrl.h>
+#include <sys/user.h>
+#include <xen/xen.h>
+#include <xen/io/blkif.h>
+#include <xen/io/ring.h>
+#include <xs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* Number of request slots of a blkif shared ring that occupies one page. */
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLK_RING_SIZE
+
+/* blktap IOCTLs: These must correspond with the blktap driver ioctls*/
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_SENDPID        4
+#define BLKTAP_IOCTL_NEWINTF        5
+#define BLKTAP_IOCTL_MINOR          6
+#define BLKTAP_IOCTL_MAJOR          7
+#define BLKTAP_QUERY_ALLOC_REQS      8
+#define BLKTAP_IOCTL_FREEINTF       9
+#define BLKTAP_IOCTL_PRINT_IDXS      100   
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002
+
+/* Both intercept bits set at once. */
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+/*
+ * Return 1 if arg is one of the accepted mode values (PASSTHROUGH,
+ * INTERCEPT_FE or INTERPOSE) and 0 otherwise.  BLKTAP_MODE_INTERCEPT_BE on
+ * its own is not accepted.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+       switch (arg) {
+       case BLKTAP_MODE_PASSTHROUGH:
+       case BLKTAP_MODE_INTERCEPT_FE:
+       case BLKTAP_MODE_INTERPOSE:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Size of the per-blkif pending_list array. */
+#define MAX_REQUESTS            64
+
+/* NOTE(review): BLKTAP_IOCTL_KICK has the same value as BLKTAP_IOCTL_KICK_FE,
+ * and MAX_PENDING_REQS is defined a second time (same value) further down
+ * in this header. */
+#define BLKTAP_IOCTL_KICK 1
+#define MAX_PENDING_REQS 64
+/* Location and numbers of the blktap device node. */
+#define BLKTAP_DEV_DIR   "/dev/xen"
+#define BLKTAP_DEV_NAME  "blktap"
+#define BLKTAP_DEV_MAJOR 254
+#define BLKTAP_DEV_MINOR 0
+
+/* Pages in the mmap()ed region: the ring page(s) followed by MMAP_PAGES
+ * data pages (MMAP_PAGES is defined later in this header). */
+#define BLKTAP_RING_PAGES       1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
+
+struct blkif;
+
+/*
+ * Bookkeeping for one request: a copy of the ring request, the blkif it
+ * belongs to, plus a counter and a status value.
+ * NOTE(review): how count/status are used is not visible in this header --
+ * presumably outstanding segments and the eventual response status.
+ */
+typedef struct {
+       blkif_request_t  req;
+       struct blkif         *blkif;
+       int              count;
+        int16_t          status;
+} pending_req_t;
+
+/*
+ * Callbacks describing the disk behind a blkif.  xenbus.c writes their
+ * results to the backend's "sectors", "sector-size" and "info" xenstore
+ * nodes respectively.
+ */
+struct blkif_ops {
+       long int (*get_size)(struct blkif *blkif);
+       long int (*get_secsize)(struct blkif *blkif);
+       unsigned (*get_info)(struct blkif *blkif);
+};
+
+/*
+ * Userspace state of one virtual block interface.  Instances are created by
+ * alloc_blkif(), completed by blkif_init() and kept in a hash table keyed by
+ * (domid, handle).
+ */
+typedef struct blkif {
+       /* hash key: owning domain and per-domain device handle */
+       domid_t domid;
+       long int handle;
+       
+       long int pdev;
+       long int readonly;
+       
+       enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
+       
+       /* ops: expected to be filled in by the new_blkif hook;
+        * hash_next: next entry in the same hash bucket */
+       struct blkif_ops *ops;
+       struct blkif *hash_next;
+       
+       /* prv and info are both released with free() by free_blkif() */
+       void *prv;  /* device-specific data */
+       void *info; /*Image parameter passing */
+       pending_req_t    pending_list[MAX_REQUESTS];
+       /* -1 until blkif_init() stores the devmap hook's result */
+       int devnum;
+       /* NOTE(review): presumably a pipe pair indexed by READ/WRITE -- confirm */
+       int fds[2];
+       int be_id;
+       int major;
+       int minor;
+       pid_t tappid;
+       int drivertype;
+       uint16_t cookie;
+} blkif_t;
+
+/* Image parameters attached to blkif_t.info; xenbus.c fills params from the
+ * backend's "params" xenstore node. */
+typedef struct blkif_info {
+       char *params;
+} blkif_info_t;
+
+/* blkif.c: hook registration and blkif life cycle. */
+void register_new_devmap_hook(int (*fn)(blkif_t *blkif));
+void register_new_unmap_hook(int (*fn)(blkif_t *blkif));
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif));
+/* NOTE(review): handle is unsigned int here but long int in blkif_t and
+ * blkif_init(). */
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+blkif_t *alloc_blkif(domid_t domid);
+int blkif_init(blkif_t *blkif, long int handle, long int pdev, 
+               long int readonly);
+void free_blkif(blkif_t *blkif);
+/* Clears the blkif hash table. */
+void __init_blkif(void);
+
+/*
+ * State of one opened tap device.
+ * NOTE(review): field roles are inferred from names and types only
+ * (descriptor, mapped memory, shared ring and its back-ring view, start
+ * address of the data area, owning blkif) -- confirm against the users.
+ */
+typedef struct tapdev_info {
+       int fd;
+       char *mem;
+       blkif_sring_t *sring;
+       blkif_back_ring_t  fe_ring;
+       unsigned long vstart;
+       blkif_t *blkif;
+} tapdev_info_t;
+
+/* A (domain id, bus id) pair.
+ * NOTE(review): presumably the argument of BLKTAP_IOCTL_NEWINTF -- confirm. */
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
+/* Disk image properties; free_blkif() treats blkif_t.prv as an image_t.
+ * NOTE(review): the fields presumably mirror the three blkif_ops getters. */
+typedef struct image {
+       long int size;
+       long int secsize;
+       long int info;
+} image_t;
+
+/* Header of a control message.
+ * NOTE(review): type presumably holds one of the CTLMSG_* values below. */
+typedef struct msg_hdr {
+       uint16_t    type;
+       uint16_t   len;
+       uint16_t   drivertype;
+       uint16_t   cookie;
+} msg_hdr_t;
+
+/* Device number and domain id of a new device.
+ * NOTE(review): presumably the CTLMSG_NEWDEV payload; devnum is only 8 bits
+ * wide while blkif_t.devnum is an int. */
+typedef struct msg_newdev {
+       uint8_t     devnum;
+       uint16_t    domid;
+} msg_newdev_t;
+
+/* A process id.  NOTE(review): presumably the CTLMSG_PID payload. */
+typedef struct msg_pid {
+       pid_t     pid;
+} msg_pid_t;
+
+/* NOTE(review): presumably the indices into blkif_t.fds[] -- confirm. */
+#define READ 0
+#define WRITE 1
+
+/*Control Messages between manager and tapdev*/
+#define CTLMSG_PARAMS      1
+#define CTLMSG_IMG         2
+#define CTLMSG_IMG_FAIL    3
+#define CTLMSG_NEWDEV      4
+#define CTLMSG_NEWDEV_RSP  5
+#define CTLMSG_NEWDEV_FAIL 6
+#define CTLMSG_CLOSE       7
+#define CTLMSG_CLOSE_RSP   8
+#define CTLMSG_PID         9
+#define CTLMSG_PID_RSP     10
+
+/* xenstore/xenbus: */
+extern int add_blockdevice_probe_watch(struct xs_handle *h, 
+                                       const char *domname);
+int xs_fire_next_watch(struct xs_handle *h);
+
+
+/* Arbitrary values, must match the underlying driver... */
+#define MAX_PENDING_REQS 64
+#define MAX_TAP_DEV 100
+
+/* Accessing attached data page mappings */
+/*
+ * Every request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive pages;
+ * MMAP_VADDR() yields the address of segment _seg of request _req in an
+ * area that starts at _vstart.
+ */
+#define MMAP_PAGES                                              \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_vstart,_req,_seg)                                   \
+    ((_vstart) +                                              \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
+     ((_seg) * PAGE_SIZE))
+
+/* Defines that are only used by library clients */
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+/* Printable names of the block operations, indexed by BLKIF_OP_* value.
+ * Being static in a header, every including file gets its own copy. */
+static char *blkif_op_name[] = {
+       [BLKIF_OP_READ]       = "READ",
+       [BLKIF_OP_WRITE]      = "WRITE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+
+#endif /* __BLKTAPLIB_H__ */
diff --git a/tools/blktap/lib/list.h b/tools/blktap/lib/list.h
new file mode 100644 (file)
index 0000000..bda5f46
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * list.h
+ * 
+ * This is a subset of linux's list.h intended to be used in user-space.
+ * 
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+/* Recognisable non-NULL values stored into the next/prev pointers of an
+ * entry once list_del() has removed it. */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/* Link node of a circular doubly-linked list; embed it in the structure
+ * that is to be kept on a list. */
+struct list_head {
+        struct list_head *next, *prev;
+};
+/* An empty list is a head whose next and prev point back at itself. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_HEAD(name) \
+        struct list_head name = LIST_HEAD_INIT(name)
+
+/* Insert new between two entries that are known to be adjacent
+ * (prev immediately followed by next). */
+static inline void __list_add(struct list_head *new,
+                              struct list_head *prev,
+                              struct list_head *next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+
+/* Insert new directly after head, i.e. at the front of the list. */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+        struct list_head *first = head->next;
+
+        __list_add(new, head, first);
+}
+/* Make prev and next point at each other, dropping whatever used to sit
+ * between them from the chain. */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+        next->prev = prev;
+        prev->next = next;
+}
+/* Unlink entry from its list and poison its pointers so that any further
+ * use of the removed node is easy to spot. */
+static inline void list_del(struct list_head *entry)
+{
+        struct list_head *before = entry->prev;
+        struct list_head *after  = entry->next;
+
+        __list_del(before, after);
+        entry->prev = LIST_POISON2;
+        entry->next = LIST_POISON1;
+}
+/* Recover a pointer to the enclosing structure of type `type` from a pointer
+ * to its embedded list_head `member`. */
+#define list_entry(ptr, type, member)                                   \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+/* Visit every enclosing structure on the list at head; pos is the cursor.
+ * The next pointer is read from pos after the body has run, so the body
+ * must not remove or free pos.  Relies on the GNU typeof extension. */
+#define list_for_each_entry(pos, head, member)                          \
+        for (pos = list_entry((head)->next, typeof(*pos), member);      \
+             &pos->member != (head);                                    \
+             pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#endif /* __LIST_H__ */
diff --git a/tools/blktap/lib/xenbus.c b/tools/blktap/lib/xenbus.c
new file mode 100644 (file)
index 0000000..91cdd00
--- /dev/null
@@ -0,0 +1,387 @@
+/*
+ * xenbus.c
+ * 
+ * xenbus interface to the blocktap.
+ * 
+ * this handles the top-half of integration with block devices through the
+ * store -- the tap driver negotiates the device channel etc, while the
+ * userland tap client needs to sort out the disk parameters etc.
+ * 
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <printf.h>
+#include <string.h>
+#include <err.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <xs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <time.h>
+#include <sys/time.h>
+#include "blktaplib.h"
+#include "list.h"
+#include "xs_api.h"
+
+/*
+ * Debug tracing.  With "#if 0" (as committed) DPRINTF() expands to a no-op
+ * expression; change it to "#if 1" to route the messages to printf().
+ */
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * One tracked backend device.  Entries are created by ueblktap_probe(),
+ * linked on belist and destroyed by backend_remove().
+ */
+struct backend_info
+{
+       /* our communications channel */
+       blkif_t *blkif;
+       
+       /* values gathered from the backend's xenstore directory */
+       long int frontend_id;
+       long int pdev;
+       long int readonly;
+       
+       /* heap-allocated xenstore paths, released by backend_remove() */
+       char *backpath;
+       char *frontpath;
+       
+       struct list_head list;
+};
+
+/* All backends currently being tracked. */
+static LIST_HEAD(belist);
+
+/*
+ * Return the offset of the (len+1)-th occurrence of c in str.
+ *
+ * When str contains exactly len occurrences, the length of str is returned
+ * instead (the string ends where the next separator would have been).
+ * With fewer than len occurrences the result is -ERANGE.
+ */
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+       unsigned int pos = 0;
+
+       while (str[pos] != '\0') {
+               if (str[pos] == c) {
+                       if (len == 0)
+                               return pos;
+                       len--;
+               }
+               pos++;
+       }
+
+       if (len != 0)
+               return -ERANGE;
+       return pos;
+}
+
+/*
+ * Extract the numeric component that follows the 7th '/' of a backend path
+ * (the "2049" in /local/domain/0/backend/vbd/1/2049).
+ *
+ * Returns the value as parsed by atoi(), or -1 if the path does not contain
+ * a 7th '/'.
+ */
+static int get_be_id(const char *str)
+{
+       int len, end;
+
+       len = strsep_len(str, '/', 6);
+       end = strlen(str);
+
+       /*
+        * len == end means the path stopped after six separators: there is
+        * no 7th '/' and nothing after it to parse.
+        */
+       if ( (len < 0) || (len >= end) )
+               return -1;
+
+       /*
+        * The id runs to the end of the string, so it can be parsed in
+        * place; no bounded scratch buffer that a long component could
+        * overflow is needed.
+        */
+       return atoi(str + len + 1);
+}
+
+/* Return the tracked backend whose backpath equals bepath, or NULL. */
+static struct backend_info *be_lookup_be(const char *bepath)
+{
+       struct backend_info *cur;
+
+       list_for_each_entry(cur, &belist, list) {
+               if (!strcmp(cur->backpath, bepath))
+                       return cur;
+       }
+       return NULL;
+}
+
+/* Return 1 if a backend with this backpath is being tracked, else 0. */
+static int be_exists_be(const char *bepath)
+{
+       struct backend_info *found = be_lookup_be(bepath);
+
+       return found ? 1 : 0;
+}
+
+/* Return the tracked backend whose frontpath equals fepath, or NULL. */
+static struct backend_info *be_lookup_fe(const char *fepath)
+{
+       struct backend_info *cur;
+
+       list_for_each_entry(cur, &belist, list) {
+               if (!strcmp(cur->frontpath, fepath))
+                       return cur;
+       }
+       return NULL;
+}
+
+/*
+ * Stop tracking a backend: take it off belist, release its blkif (if it
+ * has one) and free the entry together with both path strings.
+ * The xenstore handle is not used.  Always returns 0.
+ */
+static int backend_remove(struct xs_handle *h, struct backend_info *be)
+{
+       blkif_t *blkif = be->blkif;
+
+       /* Unhook from be list. */
+       list_del(&be->list);
+
+       if (blkif != NULL) {
+               DPRINTF("Freeing blkif dev [%d]\n",be->blkif->devnum);
+               free_blkif(blkif);
+       }
+
+       /* free(NULL) is a no-op, so the paths need no guards. */
+       free(be->frontpath);
+       free(be->backpath);
+       free(be);
+       return 0;
+}
+
+/*
+ * Bring the tracked backend at bepath up.
+ *
+ * Reads "physical-device" and "read-only" from the backend directory.  If
+ * the backend has no blkif yet, one is allocated, given the "params" string
+ * as its info and initialised through blkif_init().  Finally the device's
+ * "sectors", "sector-size" and "info" are published in xenstore and the
+ * blkif is marked CONNECTED.
+ *
+ * On failure the backend is removed again if it owns a blkif.  Note that
+ * bepath may be the backend's own backpath string, which backend_remove()
+ * frees, so bepath must not be touched after the fail label.
+ */
+static void ueblktap_setup(struct xs_handle *h, char *bepath)
+{
+       struct backend_info *be;
+       char *path = NULL, *p;
+       int er, deverr;
+       long int pdev = 0, handle;
+       blkif_info_t *blk;
+
+       be = be_lookup_be(bepath);
+       if (be == NULL)
+       {
+               DPRINTF("ERROR: backend changed called for nonexistent "
+                       "backend! (%s)\n", bepath);
+               goto fail;
+       }
+
+       deverr = xs_gather(h, bepath, "physical-device", "%li", &pdev, NULL);
+       if (!deverr) {
+               DPRINTF("pdev set to %ld\n",pdev);
+               if (be->pdev && be->pdev != pdev) {
+                       DPRINTF("changing physical-device not supported");
+                       goto fail;
+               }
+               be->pdev = pdev;
+       }
+
+       /*Check to see if device is to be opened read-only*/
+       if (asprintf(&path, "%s/%s", bepath, "read-only") == -1) {
+               /* path is undefined after a failed asprintf() */
+               path = NULL;
+               goto fail;
+       }
+       if (xs_exists(h, path))
+               be->readonly = 1;
+
+       if (be->blkif == NULL) {
+
+               /* Front end dir is a number, which is used as the handle. */
+               p = strrchr(be->frontpath, '/');
+               if (p == NULL)
+                       goto fail;
+               handle = strtoul(p + 1, NULL, 0);
+
+               be->blkif = alloc_blkif(be->frontend_id);
+
+               if (be->blkif == NULL)
+                       goto fail;
+
+               be->blkif->be_id = get_be_id(bepath);
+
+               /*Insert device specific info*/
+               blk = malloc(sizeof(blkif_info_t));
+               if (!blk) {
+                       DPRINTF("Out of memory - blkif_info_t\n");
+                       goto fail;
+               }
+               blk->params = NULL;
+               er = xs_gather(h, bepath, "params", NULL, &blk->params, NULL);
+               if (er) {
+                       /* not yet owned by the blkif, so free it here */
+                       free(blk);
+                       goto fail;
+               }
+               be->blkif->info = blk;
+
+               if (deverr) {
+                       /*Dev number was not available, try to set manually*/
+                       pdev = convert_dev_name_to_num(blk->params);
+                       be->pdev = pdev;
+               }
+
+               er = blkif_init(be->blkif, handle, be->pdev, be->readonly);
+
+               if (er != 0) {
+                       DPRINTF("Unable to open device %s\n",blk->params);
+                       goto fail;
+               }
+
+               DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", bepath);
+       }
+
+       /* The new_blkif hook is expected to have set ops; don't crash if not. */
+       if (be->blkif->ops == NULL) {
+               DPRINTF("ERROR: blkif has no ops\n");
+               goto fail;
+       }
+
+       /* Supply the information about the device to xenstore */
+       er = xs_printf(h, be->backpath, "sectors", "%lu",
+                       be->blkif->ops->get_size(be->blkif));
+
+       if (er == 0) {
+               DPRINTF("ERROR: Failed writing sectors");
+               goto fail;
+       }
+
+       er = xs_printf(h, be->backpath, "sector-size", "%lu",
+                       be->blkif->ops->get_secsize(be->blkif));
+
+       if (er == 0) {
+               DPRINTF("ERROR: Failed writing sector-size");
+               goto fail;
+       }
+
+       er = xs_printf(h, be->backpath, "info", "%u",
+                       be->blkif->ops->get_info(be->blkif));
+
+       if (er == 0) {
+               DPRINTF("ERROR: Failed writing info");
+               goto fail;
+       }
+
+       be->blkif->state = CONNECTED;
+       DPRINTF("[SETUP] Complete\n\n");
+       goto close;
+
+fail:
+       if ( (be != NULL) && (be->blkif != NULL) )
+               backend_remove(h, be);
+close:
+       free(path);
+       return;
+}
+
+/**
+ * Xenstore watch callback entry point. This code replaces the hotplug
+ * scripts: it is called as soon as the xenstore backend driver entries are
+ * created, and again for every later update below the watched directory.
+ *
+ * The reported path is cut down to the backend's own directory.  If the
+ * frontend entries can be read and the backend is not tracked yet, a new
+ * backend_info is put on belist and set up.  If they cannot be read, any
+ * tracked backend for that path is removed.
+ */
+static void ueblktap_probe(struct xs_handle *h, struct xenbus_watch *w,
+                          const char *bepath_im)
+{
+       struct backend_info *be = NULL;
+       char *frontend = NULL, *bepath = NULL;
+       int er, len;
+
+       bepath = strdup(bepath_im);
+
+       if (!bepath) {
+               DPRINTF("No path\n");
+               return;
+       }
+
+       /*
+        *asserts that xenstore structure is always 7 levels deep
+        *e.g. /local/domain/0/backend/vbd/1/2049
+        */
+       len = strsep_len(bepath, '/', 7);
+       if (len < 0)
+               goto free_be;
+       bepath[len] = '\0';
+
+       be = malloc(sizeof(*be));
+       if (!be) {
+               DPRINTF("ERROR: allocating backend structure\n");
+               goto free_be;
+       }
+       memset(be, 0, sizeof(*be));
+
+       er = xs_gather(h, bepath,
+                      "frontend-id", "%li", &be->frontend_id,
+                      "frontend", NULL, &frontend,
+                      NULL);
+
+       if (er) {
+               /*
+                *Unable to find frontend entries,
+                *bus-id is no longer valid
+                */
+               DPRINTF("ERROR: Frontend-id check failed, removing backend: "
+                       "[%s]\n",bepath);
+
+               /**
+                * BE info should already exist,
+                * free new mem and find old entry
+                */
+               free(be);
+               be = be_lookup_be(bepath);
+               if (be != NULL)
+                       backend_remove(h, be);
+               /*
+                * be is now either NULL or already released by
+                * backend_remove(); it must never reach the free() at
+                * free_be, which would free a node still linked in belist.
+                */
+               be = NULL;
+               goto free_be;
+       }
+
+       /* Are we already tracking this device? */
+       if (be_exists_be(bepath)) {
+               goto free_be;
+       }
+
+       /* From here on the list entry owns bepath and frontend. */
+       be->backpath = bepath;
+       be->frontpath = frontend;
+
+       list_add(&be->list, &belist);
+
+       DPRINTF("[PROBE]\tADDED NEW DEVICE (%s)\n", bepath);
+       DPRINTF("\tFRONTEND (%s),(%ld)\n", frontend,be->frontend_id);
+
+       ueblktap_setup(h, bepath);
+       return;
+
+ free_be:
+       free(frontend);
+       free(bepath);
+       free(be);
+       return;
+}
+
+/**
+ *We set a general watch on the backend tap directory of the named domain.
+ *ueblktap_probe is called for every update.
+ *Our job is to monitor for new entries. As they
+ *are created, we initialise the state and attach a disk.
+ *
+ *Returns 0 on success, -ENOENT if the domain cannot be found, -ENOMEM if
+ *the watch path cannot be built and -EINVAL for the remaining failures.
+ */
+
+int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname)
+{
+       char *domid, *path;
+       struct xenbus_watch *vbd_watch;
+       int er;
+
+       domid = get_dom_domid(h, domname);
+
+       DPRINTF("%s: %s\n",
+               domname, (domid != NULL) ? domid : "[ not found! ]");
+
+       /* Without a domain id there is no directory to watch. */
+       if (domid == NULL)
+               return -ENOENT;
+
+       /*
+        * asprintf() leaves path undefined on failure, so the return value
+        * has to be tested rather than the pointer.
+        * NOTE(review): domid is not released here because the ownership
+        * rules of get_dom_domid() are not visible from this file.
+        */
+       if (asprintf(&path, "/local/domain/%s/backend/tap", domid) == -1)
+               return -ENOMEM;
+
+       vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch));
+       if (!vbd_watch) {
+               DPRINTF("ERROR: unable to malloc vbd_watch [%s]\n", path);
+               free(path);
+               return -EINVAL;
+       }
+       vbd_watch->node     = path;
+       vbd_watch->callback = ueblktap_probe;
+       er = register_xenbus_watch(h, vbd_watch);
+       if (er == 0) {
+               /*
+                * NOTE(review): the watch and its path are deliberately not
+                * freed here -- confirm register_xenbus_watch()'s return
+                * convention (and that it keeps no reference on this path)
+                * before adding any cleanup.
+                */
+               DPRINTF("ERROR: adding vbd probe watch %s\n", path);
+               return -EINVAL;
+       }
+       return 0;
+}
diff --git a/tools/blktap/lib/xs_api.c b/tools/blktap/lib/xs_api.c
new file mode 100644 (file)
index 0000000..44abcf2
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ * xs_api.c
+ * 
+ * blocktap interface functions to xenstore
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <printf.h>
+#include <string.h>
+#include <err.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <xs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <poll.h>
+#include "blktaplib.h"
+#include "list.h"
+#include "xs_api.h"
+
+/* Debug tracing: change "#if 0" to "#if 1" to route DPRINTF to printf. */
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* List of the watches added by register_xenbus_watch(). */
+static LIST_HEAD(watches);
+/* Base of the "/dev/sd" numbering in convert_dev_name_to_num(), which
+ * also returns this value for an unrecognised device name. */
+#define BASE_DEV_VAL 2048
+
+/**
+ * Read several nodes below dir inside a single xenstore transaction.
+ *
+ * The variable arguments are (name, fmt, result) triples terminated by a
+ * NULL name.  With a non-NULL fmt the node's value is parsed with sscanf
+ * into *result; with a NULL fmt, *(char **)result receives the string
+ * returned by xs_read and the caller becomes responsible for freeing it.
+ *
+ * Returns 0 on success or a positive errno value on failure.
+ */
+int xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+       va_list ap;
+       const char *name;
+       char *path;
+       int ret = 0;
+       unsigned int len;
+       xs_transaction_t xth;
+
+again:
+       if ( (xth = xs_transaction_start(xs)) == XBT_NULL) {
+               DPRINTF("unable to start xs trasanction\n");
+               ret = ENOMEM;
+               return ret;
+       }
+
+       va_start(ap, dir);
+       while ( (ret == 0) && (name = va_arg(ap, char *)) != NULL) {
+               const char *fmt = va_arg(ap, char *);
+               void *result = va_arg(ap, void *);
+               char *p;
+
+               if (asprintf(&path, "%s/%s", dir, name) == -1)
+               {
+                       printf("allocation error in xs_gather!\n");
+                       ret = ENOMEM;
+                       break;
+               }
+
+               p = xs_read(xs, xth, path, &len);
+
+               free(path);
+               if (p == NULL) {
+                       ret = ENOENT;
+                       break;
+               }
+               if (fmt) {
+                       /*
+                        * One conversion is expected.  sscanf returns EOF
+                        * (negative), not 0, when the value is empty, so
+                        * anything below 1 means *result was not written.
+                        */
+                       if (sscanf(p, fmt, result) < 1)
+                               ret = EINVAL;
+                       free(p);
+               } else
+                       *(char **)result = p;
+       }
+       va_end(ap);
+
+       /* A non-zero ret aborts the transaction instead of committing it. */
+       if (!xs_transaction_end(xs, xth, ret)) {
+               /* EAGAIN on commit: retry the whole gather. */
+               if (ret == 0 && errno == EAGAIN)
+                       goto again;
+                else
+                       ret = errno;
+       }
+
+       return ret;
+}
+
+
+/*
+ * Single printf and write: formats a value and stores it in the xenstore
+ * node "dir/node", including the terminating NUL byte.
+ *
+ * Returns the result of xs_write (non-zero on success), or 0 when the
+ * value or the path could not be allocated.
+ */
+int xs_printf(struct xs_handle *h, const char *dir, const char *node, 
+             const char *fmt, ...)
+{
+        char *buf, *path;
+        va_list ap;
+        int ret;
+
+        va_start(ap, fmt);
+        ret = vasprintf(&buf, fmt, ap);
+        va_end(ap);
+
+        /* On failure the *asprintf family leaves the pointer undefined,
+         * so the return value is the only reliable indicator. */
+        if (ret == -1)
+               return 0;
+
+        if (asprintf(&path, "%s/%s", dir, node) == -1) {
+               free(buf);
+               return 0;
+        }
+
+        ret = xs_write(h, XBT_NULL, path, buf, strlen(buf)+1);
+
+        free(buf);
+        free(path);
+
+        return ret;
+}
+
+
+/*
+ * Report whether path can be listed with xs_directory.
+ * Returns 1 when the listing succeeds, 0 when it fails or when no
+ * transaction could be started.
+ */
+int xs_exists(struct xs_handle *h, const char *path)
+{
+       xs_transaction_t xth;
+       unsigned int count;
+       char **entries;
+
+       xth = xs_transaction_start(h);
+       if (xth == XBT_NULL) {
+               printf("unable to start xs trasanction\n");
+               return 0;
+       }
+
+       entries = xs_directory(h, xth, path, &count);
+       xs_transaction_end(h, xth, 0);
+
+       if (entries != NULL) {
+               /* Only the existence matters, not the contents. */
+               free(entries);
+               return 1;
+       }
+       return 0;
+}
+
+
+
+/**
+ * Look up the domid of the domain whose name node equals name
+ * (e.g. "Domain-0") by scanning the entries below /local/domain.
+ * This assumes that the domain name we are looking for is unique: the
+ * first match wins.
+ *
+ * Returns the string read from the matching entry's domid node, which the
+ * caller must free, or NULL if nothing matched or an error occurred.
+ */
+char *get_dom_domid(struct xs_handle *h, const char *name)
+{
+       char **e, *val, *domid = NULL;
+       unsigned int num, len, i;
+       char *path;
+       xs_transaction_t xth;
+
+       if ( (xth = xs_transaction_start(h)) == XBT_NULL) {
+               warn("unable to start xs trasanction\n");
+               return NULL;
+       }
+
+       e = xs_directory(h, xth, "/local/domain", &num);
+       if (e == NULL)
+               num = 0;        /* num is not meaningful without a listing */
+
+       /* The for-increment also runs on "continue", so an entry whose
+        * name node cannot be read is skipped rather than retried forever. */
+       for (i = 0; i < num; i++) {
+               if (asprintf(&path, "/local/domain/%s/name", e[i]) == -1)
+                       break;
+               val = xs_read(h, xth, path, &len);
+               free(path);
+               if (val == NULL)
+                       continue;
+
+               if (strcmp(val, name) == 0) {
+                       /* match! */
+                       free(val);
+                       if (asprintf(&path, "/local/domain/%s/domid",
+                                    e[i]) != -1) {
+                               domid = xs_read(h, xth, path, &len);
+                               free(path);
+                       }
+                       break;
+               }
+               free(val);
+       }
+       xs_transaction_end(h, xth, 0);
+
+       free(e);
+       return domid;
+}
+
+/*
+ * Index of the unit letter at **pp within "a".."p" (16 when it is not
+ * one of them).  Steps *pp past that character unless it is the string
+ * terminator, so the caller never reads beyond the end of the name.
+ */
+static int dev_letter_index(const char **pp)
+{
+       static const char alpha[] = "abcdefghijklmnop";
+       const char *hit = NULL;
+
+       if (**pp != '\0') {
+               hit = strchr(alpha, **pp);
+               (*pp)++;
+       }
+       return hit ? (int)(hit - alpha) : (int)(sizeof(alpha) - 1);
+}
+
+/*
+ * Translate a device name into a device number:
+ *   ".../dev/sd<l><n>"   -> BASE_DEV_VAL + 16*index(l) + n
+ *   ".../dev/hd<l><n>"   -> majors[index(l)/2]*256 + n
+ *   ".../dev/xvd<l><n>"  -> 202*256 + 16*index(l) + n
+ *   "...plx<n>"          -> n
+ * Any other name yields BASE_DEV_VAL.  The prefixes are string literals,
+ * so nothing is allocated, and the suffix is read from where the prefix
+ * was actually found.
+ */
+int convert_dev_name_to_num(char *name) {
+       static const int majors[10] = {3,22,33,34,56,57,88,89,90,91};
+       const char *p;
+       int i;
+
+       if ((p = strstr(name, "/dev/sd")) != NULL) {
+               p += strlen("/dev/sd");
+               i = dev_letter_index(&p);
+               return BASE_DEV_VAL + (16*i) + atoi(p);
+       } else if ((p = strstr(name, "/dev/hd")) != NULL) {
+               p += strlen("/dev/hd");
+               i = dev_letter_index(&p);
+               /* NOTE(review): both units of a pair (hda/hdb, ...) map to
+                * the same number here; confirm whether a per-unit minor
+                * offset is intended. */
+               return (majors[i/2]*256) + atoi(p);
+
+       } else if ((p = strstr(name, "/dev/xvd")) != NULL) {
+               p += strlen("/dev/xvd");
+               i = dev_letter_index(&p);
+               return (202*256) + (16*i) + atoi(p);
+
+       } else if ((p = strstr(name, "plx")) != NULL) {
+               p += strlen("plx");
+               return atoi(p);
+
+       } else {
+               DPRINTF("Unknown device type, setting to default.\n");
+               return BASE_DEV_VAL;
+       }
+}
+
+/**
+ * Map a token back to a registered watch.
+ * A little paranoia: the token is parsed as a hexadecimal pointer value
+ * but is only accepted if that pointer is actually on the watches list.
+ * Returns the watch, or NULL when the token matches none.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+       struct xenbus_watch *wanted, *w;
+
+       wanted = (void *)strtoul(token, NULL, 16);
+
+       list_for_each_entry(w, &watches, list) {
+               if (w == wanted)
+                       return w;
+       }
+       return NULL;
+}
+
+/**
+ * Register callback to watch this node.
+ * Like xs_watch, returns 0 on failure -- including the case where this
+ * watch is already registered -- and xs_watch's non-zero result on
+ * success, in which case the watch is added to the watches list.
+ */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       /* Pointer in ascii is the token. */
+       char token[sizeof(watch) * 2 + 1];
+       int er;
+
+       snprintf(token, sizeof(token), "%lX", (unsigned long)watch);
+       if (find_watch(token)) 
+       {
+               DPRINTF("watch collision!\n");
+               /* A negative code here is non-zero and so looked like
+                * success to callers testing for "== 0". */
+               return 0;
+       }
+
+       er = xs_watch(h, watch->node, token);
+       if (er != 0) {
+               list_add(&watch->list, &watches);
+       } 
+
+       return er;
+}
+
+/**
+ * Drop a watch that is on the watches list.
+ * Returns -EINVAL when the watch is not on the list, otherwise 0 -- even
+ * when xs_unwatch itself reports failure, which is only traced.
+ */
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       char tok[sizeof(watch) * 2 + 1];
+       int ok;
+
+       /* The token is the watch pointer printed in hex. */
+       sprintf(tok, "%lX", (long)watch);
+
+       if (find_watch(tok) == NULL) {
+               DPRINTF("no such watch!\n");
+               return -EINVAL;
+       }
+
+       ok = xs_unwatch(h, watch->node, tok);
+       list_del(&watch->list);
+
+       if (!ok) {
+               DPRINTF("XENBUS Failed to release watch %s: %i\n",
+                    watch->node, ok);
+       }
+       return 0;
+}
+
+/**
+ * Issue xs_watch again for every watch on the watches list, using the
+ * same pointer-in-hex token as at registration.  Results are ignored.
+ */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+       struct xenbus_watch *w;
+       char tok[sizeof(w) * 2 + 1];
+
+       list_for_each_entry(w, &watches, list) {
+               sprintf(tok, "%lX", (long)w);
+               xs_watch(h, w->node, tok);
+       }
+}
+
+/**
+ * Read one watch event and hand it to the callback of the matching
+ * registered watch; based on watch_thread().
+ * Returns -EAGAIN when xs_read_watch yields nothing, otherwise 1 (also
+ * when the event's token matches no registered watch).
+ */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+       unsigned int count;
+       struct xenbus_watch *target;
+       char **event;
+
+       event = xs_read_watch(h, &count);
+       if (event == NULL)
+               return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+       target = find_watch(event[XS_WATCH_TOKEN]);
+       if (target != NULL)
+               target->callback(h, target, event[XS_WATCH_PATH]);
+       else
+               DPRINTF("unregistered watch fired\n");
+
+       free(event);
+       return 1;
+}
diff --git a/tools/blktap/lib/xs_api.h b/tools/blktap/lib/xs_api.h
new file mode 100644 (file)
index 0000000..c4183a2
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * xs_api.h
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+struct xenbus_watch
+{
+        struct list_head list;
+        char *node;
+        void (*callback)(struct xs_handle *h, 
+                         struct xenbus_watch *, 
+                         const  char *node);
+};
+
+int xs_gather(struct xs_handle *xs, const char *dir, ...);
+int xs_printf(struct xs_handle *h, const char *dir, const char *node, 
+             const char *fmt, ...);
+int xs_exists(struct xs_handle *h, const char *path);
+char *get_dom_domid(struct xs_handle *h, const char *name);
+int convert_dev_name_to_num(char *name);
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+void reregister_xenbus_watches(struct xs_handle *h);
+int xs_fire_next_watch(struct xs_handle *h);
index c5ccb6c8af1fd94045649ed58a06227d5b6d15b1..12800815775e8e45015155fd6b80072f682e125e 100644 (file)
@@ -26,6 +26,7 @@ XEN_SCRIPTS += network-route vif-route
 XEN_SCRIPTS += network-nat vif-nat
 XEN_SCRIPTS += block
 XEN_SCRIPTS += block-enbd block-nbd
+XEN_SCRIPTS += blktap
 XEN_SCRIPTS += vtpm vtpm-delete
 XEN_SCRIPTS += xen-hotplug-cleanup
 XEN_SCRIPTS += external-device-migrate
diff --git a/tools/examples/blktap b/tools/examples/blktap
new file mode 100644 (file)
index 0000000..ba9f4ee
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+# Copyright (c) 2005, XenSource Ltd.
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+findCommand "$@"
+
+if [ "$command" == 'add' ]
+then
+  success
+fi
+
+exit 0
index e662015da24fcbe96131a69885ac1fa5b1e62c6b..3a01a2c7ea5188b8ee8d520bb44743bc8338d6ce 100755 (executable)
@@ -7,6 +7,9 @@ PATH=/etc/xen/scripts:$PATH
 claim_lock xenbus_hotplug_global
 
 case "$XENBUS_TYPE" in
+  tap)
+    /etc/xen/scripts/blktap "$ACTION"
+    ;;
   vbd)
     /etc/xen/scripts/block "$ACTION"
     ;;
index 91f0b06107e0401f7eb9ff2e98d185c006f903b1..21c6d8c8fc6cbdf2dcc3d763c77e98e99a18d1f4 100644 (file)
@@ -1,3 +1,4 @@
+SUBSYSTEM=="xen-backend", KERNEL=="tap*", RUN+="/etc/xen/scripts/blktap $env{ACTION}"
 SUBSYSTEM=="xen-backend", KERNEL=="vbd*", RUN+="/etc/xen/scripts/block $env{ACTION}"
 SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}"
 SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online"
diff --git a/tools/libaio/COPYING b/tools/libaio/COPYING
new file mode 100644 (file)
index 0000000..c4792dd
--- /dev/null
@@ -0,0 +1,515 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+^L
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+^L
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+\f
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+^L
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+^L
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+^L
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+^L
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+^L
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+^L
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it
+does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+
+Also add information on how to contact you by electronic and paper
+mail.
+
+You should also get your employer (if you work as a programmer) or
+your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/tools/libaio/ChangeLog b/tools/libaio/ChangeLog
new file mode 100644 (file)
index 0000000..ddcf6e3
--- /dev/null
@@ -0,0 +1,43 @@
+0.4.0
+       - remove libredhat-kernel
+       - add rough outline for man pages
+       - make the compiled io_getevents() add the extra parameter and 
+         pass the timeout for updating as per 2.5
+       - fixes for ia64, now works
+       - fixes for x86-64
+       - powerpc support from Gianni Tedesco <gianni@ecsc.co.uk>
+       - disable the NULL check in harness/cases/4.t on ia64: ia64 
+         maps the 0 page and causes this check to fail.
+
+0.3.15
+       - use real syscall interface, but don't break source compatibility 
+         yet (that will happen with 0.4.0)
+
+0.3.13
+       - add test cases
+
+0.3.11
+       - use library versioning of libredhat-kernel to always provide a 
+         fallback
+
+0.3.9
+       - add io_queue_release function
+
+0.3.8
+       - make clean deletes libredhat-kernel.so.1
+       - const struct timespec *
+       - add make srpm target
+
+0.3.7
+       - fix assembly function .types
+       - export io_getevents
+       - fix io_submit function prototype to match the kernel
+       - provide /usr/lib/libredhat-kernel.so link for compilation
+         (do NOT link against libredhat-kernel.so directly)
+       - fix soname to libaio.so.1
+       - fix dummy libredhat-kernel's soname
+       - work around nfs bug
+       - provide and install libredhat-kernel.so.1 stub
+       - Makefile improvements
+       - make sure dummy libredhat-kernel.so only returns -ENOSYS
+
diff --git a/tools/libaio/INSTALL b/tools/libaio/INSTALL
new file mode 100644 (file)
index 0000000..29b9077
--- /dev/null
@@ -0,0 +1,18 @@
+To install the library, execute the command:
+
+       make prefix=`pwd`/usr install
+
+which will install the binaries and header files into the directory 
+usr.  Set prefix=/usr to get them installed into the main system.
+
+Please note:  Do not attempt to install on the system the
+"libredhat-kernel.so" file.  It is a dummy shared library
+provided only for the purpose of being able to bootstrap
+this facility while running on systems without the correct
+libredhat-kernel.so built.  The contents of the included
+libredhat-kernel.so are only stubs; this library is NOT
+functional for anything except the internal purpose of
+linking libaio.so against the provided stubs.  At runtime,
+libaio.so requires a real libredhat-kernel.so library; this
+is provided by the Red Hat kernel RPM packages with async
+I/O functionality.
diff --git a/tools/libaio/Makefile b/tools/libaio/Makefile
new file mode 100644 (file)
index 0000000..06d8775
--- /dev/null
@@ -0,0 +1,40 @@
+NAME=libaio
+SPECFILE=$(NAME).spec
+VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE))
+RELEASE=$(shell awk '/Release:/ { print $$2 }' $(SPECFILE))
+CVSTAG = $(NAME)_$(subst .,-,$(VERSION))_$(subst .,-,$(RELEASE))
+# Use rpmbuild when it is on PATH, else plain rpm; written for POSIX sh.
+RPMBUILD=$(shell command -v rpmbuild >/dev/null 2>&1 && echo "rpmbuild" || echo "rpm")
+prefix=/usr
+includedir=$(prefix)/include
+libdir=$(prefix)/lib
+
+default: all
+.PHONY: default all install clean tag-archive create-archive archive srpm
+all:
+       @$(MAKE) -C src
+# NOTE(review): install only builds; nothing is copied under prefix -- confirm intended.
+install: all
+
+clean:
+       @$(MAKE) -C src clean
+       @$(MAKE) -C harness clean
+
+tag-archive:
+       @cvs -Q tag -F $(CVSTAG)
+
+create-archive: tag-archive
+       @rm -rf /tmp/$(NAME)
+       @cd /tmp; cvs -Q -d $(CVSROOT) export -r$(CVSTAG) $(NAME) || echo GRRRrrrrr -- ignore [export aborted]
+       @mv /tmp/$(NAME) /tmp/$(NAME)-$(VERSION)
+       @cd /tmp; tar czSpf $(NAME)-$(VERSION).tar.gz $(NAME)-$(VERSION)
+       @rm -rf /tmp/$(NAME)-$(VERSION)
+       @cp /tmp/$(NAME)-$(VERSION).tar.gz .
+       @rm -f /tmp/$(NAME)-$(VERSION).tar.gz
+       @echo " "
+       @echo "The final archive is ./$(NAME)-$(VERSION).tar.gz."
+
+archive: clean tag-archive create-archive
+
+srpm: create-archive
+       $(RPMBUILD) --define "_sourcedir `pwd`" --define "_srcrpmdir `pwd`" --nodeps -bs $(SPECFILE)
diff --git a/tools/libaio/TODO b/tools/libaio/TODO
new file mode 100644 (file)
index 0000000..0a9ac15
--- /dev/null
@@ -0,0 +1,4 @@
+- Write man pages.
+- Make -static links against libaio work.
+- Fallback on userspace if the kernel calls return -ENOSYS.
+
diff --git a/tools/libaio/harness/Makefile b/tools/libaio/harness/Makefile
new file mode 100644 (file)
index 0000000..d2483fd
--- /dev/null
@@ -0,0 +1,37 @@
+# Each cases/N.t becomes cases/N.p: main.c compiled with TEST_NAME naming the .t file.
+TEST_SRCS:=$(shell find cases/ -name \*.t | sort -n -t/ -k2)
+PROGS:=$(patsubst %.t,%.p,$(TEST_SRCS))
+HARNESS_SRCS:=main.c
+# io_queue.c
+# Libraries live in LDLIBS so that they follow main.c on the link line.
+CFLAGS=-Wall -Werror -g -O
+LDLIBS=-laio
+#-lpthread -lrt
+all: $(PROGS)
+
+$(PROGS): %.p: %.t $(HARNESS_SRCS)
+       $(CC) $(CFLAGS) -DTEST_NAME=\"$<\" -o $@ main.c $(LDLIBS)
+
+clean:
+       rm -f $(PROGS) *.o runtests.out testdir/rofile testdir/wofile testdir/rwfile
+
+.PHONY: all clean check
+FORCE: # no recipe and never a file, so the fixtures below are rebuilt on every run
+testdir/rofile: FORCE
+       rm -f $@
+       echo "test" >$@
+       chmod 400 $@
+
+testdir/wofile: FORCE
+       rm -f $@
+       echo "test" >$@
+       chmod 200 $@
+
+testdir/rwfile: FORCE
+       rm -f $@
+       echo "test" >$@
+       chmod 600 $@
+
+check: $(PROGS) testdir/rofile testdir/rwfile testdir/wofile
+       ./runtests.sh $(PROGS)
+
diff --git a/tools/libaio/harness/README b/tools/libaio/harness/README
new file mode 100644 (file)
index 0000000..5557370
--- /dev/null
@@ -0,0 +1,19 @@
+Notes on running this test suite:
+
+To run the test suite, run "make check".  All test cases should pass 
+and there should be 0 fails.
+
+Several of the test cases require a directory on the filesystem under 
+test for the creation of test files, as well as the generation of 
+error conditions.  The test cases assume the directories (or symlinks 
+to directories) are as follows:
+
+       testdir/
+               - used for general read/write test cases.  Must have at 
+                 least as much free space as the machine has RAM (up 
+                 to 768MB).
+       testdir.enospc/
+               - a filesystem that has space for writing 8KB out, but 
+                 fails with -ENOSPC beyond 8KB.
+       testdir.ext2/
+               - must be an ext2 filesystem.
diff --git a/tools/libaio/harness/attic/0.t b/tools/libaio/harness/attic/0.t
new file mode 100644 (file)
index 0000000..033e62c
--- /dev/null
@@ -0,0 +1,9 @@
+/* 0.t
+       Harness self-check: a case that prints "okay" and reports success.
+*/
+int test_main(void)
+{
+       puts("test_main: okay");
+       return 0;
+}
+
diff --git a/tools/libaio/harness/attic/1.t b/tools/libaio/harness/attic/1.t
new file mode 100644 (file)
index 0000000..799ffd1
--- /dev/null
@@ -0,0 +1,9 @@
+/* 1.t
+       Harness self-check: a case that prints "fail" and reports failure.
+*/
+int test_main(void)
+{
+       puts("test_main: fail");
+       return 1;
+}
+
diff --git a/tools/libaio/harness/cases/10.t b/tools/libaio/harness/cases/10.t
new file mode 100644 (file)
index 0000000..9d3beb2
--- /dev/null
@@ -0,0 +1,53 @@
+/* 10.t - uses testdir.enospc/rwfile
+- Check results on out-of-space and out-of-quota. (10.t)
+        - write that fills filesystem but does not go over should succeed
+        - write that fills filesystem and goes over should be partial
+        - write to full filesystem should return -ENOSPC
+        - read beyond end of file after ENOSPC should return 0
+*/
+#include "aio_setup.h"
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+int test_main(void)
+{
+/* Note: changing either of these requires updating the ext2-enospc.img
+ * filesystem image.  Also, if SIZE is less than PAGE_SIZE, problems
+ * crop up due to ext2's preallocation.
+ */
+#define LIMIT  65536
+#define SIZE   65536
+       char *buf;
+       int rwfd;
+       int status = 0, res;
+
+       rwfd = open("testdir.enospc/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600);
+                                                       assert(rwfd != -1);
+       res = ftruncate(rwfd, 0);                       assert(res == 0);
+       buf = malloc(SIZE);                             assert(buf != NULL);
+       memset(buf, 0, SIZE);
+       /* Each step passes the result it expects as the last argument
+        * (NOTE(review): attempt_rw comes from aio_setup.h -- confirm its contract). */
+       status |= attempt_rw(rwfd, buf, SIZE,   LIMIT-SIZE, WRITE, SIZE);
+       status |= attempt_rw(rwfd, buf, SIZE,   LIMIT-SIZE,  READ, SIZE);
+       /* At offset LIMIT the write is expected to give -ENOSPC and the read 0. */
+       status |= attempt_rw(rwfd, buf, SIZE,        LIMIT, WRITE, -ENOSPC);
+       status |= attempt_rw(rwfd, buf, SIZE,        LIMIT,  READ,       0);
+
+       res = ftruncate(rwfd, 0);                       assert(res == 0);
+       /* Starting one byte in, only SIZE-1 bytes are expected to transfer. */
+       status |= attempt_rw(rwfd, buf, SIZE, 1+LIMIT-SIZE, WRITE, SIZE-1);
+       status |= attempt_rw(rwfd, buf, SIZE, 1+LIMIT-SIZE,  READ, SIZE-1);
+       status |= attempt_rw(rwfd, buf, SIZE,        LIMIT,  READ,      0);
+
+       status |= attempt_rw(rwfd, buf, SIZE,        LIMIT, WRITE, -ENOSPC);
+       status |= attempt_rw(rwfd, buf, SIZE,        LIMIT,  READ,       0);
+       status |= attempt_rw(rwfd, buf,    0,        LIMIT, WRITE,       0);
+
+       res = close(rwfd);                              assert(res == 0);
+       res = unlink("testdir.enospc/rwfile");          assert(res == 0);
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/11.t b/tools/libaio/harness/cases/11.t
new file mode 100644 (file)
index 0000000..efcf6d4
--- /dev/null
@@ -0,0 +1,39 @@
+/* 11.t - uses testdir/rwfile
+- repeated read / write of same page (to check accounting) (11.t)
+*/
+#include "aio_setup.h"
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+int test_main(void)
+{
+#define COUNT  1000000
+#define SIZE   256
+       int status = 0, done, fd;
+       char *page;
+
+       fd = open("testdir/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600);
+       assert(fd != -1);
+       page = malloc(SIZE);
+       assert(page != NULL);
+       memset(page, 0, SIZE);
+
+       /* Write the same SIZE bytes at offset 0 up to COUNT times,
+        * stopping at the first attempt that reports a failure. */
+       for (done = 0; done < COUNT; done++)
+               if ((status |= attempt_rw(fd, page, SIZE, 0, WRITE_SILENT, SIZE)) != 0)
+                       break;
+       printf("completed %d out of %d writes\n", done, COUNT);
+
+       /* Same for reads; one read is still attempted when the writes
+        * above already failed, because status is tested after the call. */
+       for (done = 0; done < COUNT; done++)
+               if ((status |= attempt_rw(fd, page, SIZE, 0, READ_SILENT, SIZE)) != 0)
+                       break;
+       printf("completed %d out of %d reads\n", done, COUNT);
+
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/12.t b/tools/libaio/harness/cases/12.t
new file mode 100644 (file)
index 0000000..3499204
--- /dev/null
@@ -0,0 +1,49 @@
+/* 12.t
+- ioctx access across fork() (12.t)
+ */
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "aio_setup.h"
+
+void test_child(void)
+{
+       /* Runs in the forked child: exits with attempt_io_submit()'s result. */
+       const int rc = attempt_io_submit(io_ctx, 0, NULL, -EINVAL);
+       fflush(stdout);
+       _exit(rc);
+}
+
+int test_main(void)
+{
+       /* Fork a child that runs test_child() and turn its exit into a result. */
+       int res, status;
+       sigset_t chld;
+       pid_t pid;
+
+       if (attempt_io_submit(io_ctx, 0, NULL, 0))
+               return 1;
+       /* POSIX replacement for the obsolete sigblock(sigmask(SIGCHLD)). */
+       sigemptyset(&chld);
+       sigaddset(&chld, SIGCHLD);
+       sigprocmask(SIG_BLOCK, &chld, NULL);
+       fflush(NULL);
+       pid = fork();                           assert(pid != -1);
+       if (pid == 0)
+               test_child();
+       /* A failed waitpid() would leave status uninitialised: check it. */
+       res = waitpid(pid, &status, 0);         assert(res == pid);
+       if (WIFEXITED(status)) {
+               int failed = (WEXITSTATUS(status) != 0);
+               printf("child exited with status %d%s\n", WEXITSTATUS(status),
+                       failed ? " -- FAILED" : "");
+               return failed;
+       }
+       /* anything else: failed */
+       if (WIFSIGNALED(status))
+               printf("child killed by signal %d -- FAILED.\n",
+                       WTERMSIG(status));
+       return 1;
+}
diff --git a/tools/libaio/harness/cases/13.t b/tools/libaio/harness/cases/13.t
new file mode 100644 (file)
index 0000000..5f18005
--- /dev/null
@@ -0,0 +1,66 @@
+/* 13.t - uses testdir/rwfile
+- Submit multiple writes larger than aio-max-size (deadlocks on older
+  aio code)
+*/
+#include "aio_setup.h"
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+int test_main(void)
+{
+#define SIZE   (1024 * 1024)
+#define IOS    8
+       struct iocb     iocbs[IOS];
+       struct iocb     *iocb_list[IOS];
+       char *bufs[IOS];
+       int rwfd;
+       int status = 0, res;
+       int i;
+       /* Submit IOS writes of SIZE bytes in one io_submit, then check each completion. */
+       rwfd = open("testdir/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600);
+                                                       assert(rwfd != -1);
+       res = ftruncate(rwfd, 0);                       assert(res == 0);
+       /* Prepare IOS zero-filled writes of SIZE bytes, laid out back to back. */
+       for (i=0; i<IOS; i++) {
+               bufs[i] = malloc(SIZE);
+               assert(bufs[i] != NULL);
+               memset(bufs[i], 0, SIZE);
+               io_prep_pwrite(&iocbs[i], rwfd, bufs[i], SIZE, i * SIZE);
+               iocb_list[i] = &iocbs[i];
+       }
+
+       status |= attempt_io_submit(io_ctx, IOS, iocb_list, IOS);
+       /* Reap one completion per write, allowing 30 seconds for each. */
+       for (i=0; i<IOS; i++) {
+               struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
+               struct io_event event;
+               struct iocb *iocb;
+               long got;
+
+               res = io_getevents(io_ctx, 0, 1, &event, &ts);
+               if (res != 1) {
+                       status |= 1;
+                       printf("io_getevents failed [%d] with res=%d [%s]\n",
+                               i, res, (res < 0) ? strerror(-res) : "okay");
+                       break;
+               }
+               /* NOTE(review): event.res appears to be unsigned (confirm in libaio.h);
+                * test a signed copy so that a negative errno is seen as < 0. */
+               got = (long)event.res;
+               if (got != SIZE)
+                       status |= 1;
+               iocb = (void *)event.obj;
+               printf("event[%d]: write[%d] %s, returned: %ld [%s]\n",
+                       i, (int)(iocb - &iocbs[0]),
+                       (got != SIZE) ? "failed" : "okay",
+                       got,
+                       (got < 0) ? strerror((int)-got) : "okay");
+       }
+
+       res = ftruncate(rwfd, 0);                       assert(res == 0);
+       res = close(rwfd);                              assert(res == 0);
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/14.t b/tools/libaio/harness/cases/14.t
new file mode 100644 (file)
index 0000000..514622b
--- /dev/null
@@ -0,0 +1,90 @@
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "aio_setup.h"
+#include <sys/mman.h>
+
+#define SIZE 768*1024*1024
+
+//just submit an I/O
+
+int test_child(void)
+{
+        /* Runs in the forked child: queue large I/Os on a fresh context and
+         * exit without reaping them.  Never returns; always ends in _exit(). */
+        struct iocb iocb;
+        struct iocb *iocbs[] = { &iocb };
+        long size = SIZE;
+        int loop = 10;
+        char *buf;
+        int rwfd;
+        int res;
+        int i;
+
+        aio_setup(1024);
+
+        printf("size = %ld\n", size);
+
+        rwfd = open("testdir/rwfile", O_RDWR);          assert(rwfd != -1);
+        res = ftruncate(rwfd, 0);                       assert(res == 0);
+        buf = malloc(size);                             assert(buf != NULL);
+
+        /* Alternate writes (even i) and reads (odd i) of the whole buffer. */
+        for (i = 0; i < loop; i++) {
+                if (i % 2 == 0)
+                        io_prep_pwrite(&iocb, rwfd, buf, size, 0);
+                else
+                        io_prep_pread(&iocb, rwfd, buf, size, 0);
+
+                res = io_submit(io_ctx, 1, iocbs);
+                if (res != 1) {
+                        printf("child: submit: io_submit res=%d [%s]\n",
+                               res, strerror(-res));
+                        /* _exit() skips stdio cleanup, so flush by hand. */
+                        fflush(stdout);
+                        _exit(1);
+                }
+        }
+
+        res = ftruncate(rwfd, 0);                       assert(res == 0);
+
+        /* _exit() skips stdio cleanup; flush so that "size = ..." is not lost
+         * when stdout is a pipe or a file. */
+        fflush(stdout);
+        _exit(0);
+}
+
+/* from 12.t: fork a child running test_child() and report how it exited */
+int test_main(void)
+{
+       int res, status;
+       sigset_t chld;
+       pid_t pid;
+
+       if (attempt_io_submit(io_ctx, 0, NULL, 0))
+               return 1;
+       /* POSIX replacement for the obsolete sigblock(sigmask(SIGCHLD)). */
+       sigemptyset(&chld);
+       sigaddset(&chld, SIGCHLD);
+       sigprocmask(SIG_BLOCK, &chld, NULL);
+       fflush(NULL);
+       pid = fork();                           assert(pid != -1);
+       if (pid == 0)
+               test_child();
+       /* A failed waitpid() would leave status uninitialised: check it. */
+       res = waitpid(pid, &status, 0);         assert(res == pid);
+       if (WIFEXITED(status)) {
+               int failed = (WEXITSTATUS(status) != 0);
+               printf("child exited with status %d%s\n", WEXITSTATUS(status),
+                       failed ? " -- FAILED" : "");
+               return failed;
+       }
+
+       /* anything else: failed */
+       if (WIFSIGNALED(status))
+               printf("child killed by signal %d -- FAILED.\n",
+                       WTERMSIG(status));
+       return 1;
+}
diff --git a/tools/libaio/harness/cases/2.t b/tools/libaio/harness/cases/2.t
new file mode 100644 (file)
index 0000000..3a0212d
--- /dev/null
@@ -0,0 +1,41 @@
+/* 2.t
+- io_setup (#2)
+        - with invalid context pointer
+        - with maxevents <= 0
+        - with an already initialized ctxp
+*/
+
+int attempt(int n, io_context_t *ctxp, int expect)
+{
+       /* Call io_setup(n, ctxp), log the outcome, and return 1 when the
+        * result differs from expect, 0 when it matches. */
+       int got, mismatch;
+
+       printf("expect %3d: io_setup(%5d, %p) = ", expect, n, ctxp);
+       fflush(stdout);
+       got = io_setup(n, ctxp);
+       mismatch = (got != expect);
+       printf("%3d [%s]%s\n", got, strerror(-got),
+               mismatch ? " -- FAILED" : "");
+       return mismatch;
+}
+
+int test_main(void)
+{
+       io_context_t ctx = NULL;
+       int failures = 0;
+       /* Invalid context pointer: -EFAULT expected whatever the event count. */
+       failures |= attempt(-1000, KERNEL_RW_POINTER, -EFAULT);
+       failures |= attempt( 1000, KERNEL_RW_POINTER, -EFAULT);
+       failures |= attempt(    0, KERNEL_RW_POINTER, -EFAULT);
+       /* Valid pointer but maxevents <= 0: -EINVAL, and ctx must stay NULL. */
+       failures |= attempt(-1000, &ctx, -EINVAL);
+       failures |= attempt(   -1, &ctx, -EINVAL);
+       failures |= attempt(    0, &ctx, -EINVAL);
+       assert(ctx == NULL);
+       /* First setup is expected to succeed; repeating it on that ctx, to fail. */
+       failures |= attempt(    1, &ctx, 0);
+       failures |= attempt(    1, &ctx, -EINVAL);
+       return failures;
+}
+
diff --git a/tools/libaio/harness/cases/3.t b/tools/libaio/harness/cases/3.t
new file mode 100644 (file)
index 0000000..7773d80
--- /dev/null
@@ -0,0 +1,25 @@
+/* 3.t
+- io_submit/io_getevents with invalid addresses (3.t)
+
+*/
+#include "aio_setup.h"
+
+int test_main(void)
+{
+       struct iocb a, b;
+       struct iocb *good_ios[] = { &a, &b };
+       struct iocb *bad1_ios[] = { NULL, &b };
+       struct iocb *bad2_ios[] = { KERNEL_RW_POINTER, &a };
+       int     status = 0;
+       /* Last argument is presumably the expected io_submit result -- TODO confirm in aio_setup.h. */
+       status |= attempt_io_submit(BAD_CTX, 1,   good_ios, -EINVAL);
+       status |= attempt_io_submit( io_ctx, 0,   good_ios,       0);
+       status |= attempt_io_submit( io_ctx, 1,       NULL, -EFAULT);
+       status |= attempt_io_submit( io_ctx, 1, (void *)-1, -EFAULT);
+       status |= attempt_io_submit( io_ctx, 2,   bad1_ios, -EFAULT);
+       status |= attempt_io_submit( io_ctx, 2,   bad2_ios, -EFAULT);
+       status |= attempt_io_submit( io_ctx, -1,  good_ios, -EINVAL);
+
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/4.t b/tools/libaio/harness/cases/4.t
new file mode 100644 (file)
index 0000000..972b4f2
--- /dev/null
@@ -0,0 +1,72 @@
+/* 4.t
+- read of descriptor without read permission (4.t)
+- write to descriptor without write permission (4.t)
+- check that O_APPEND writes actually append
+
+*/
+#include "aio_setup.h"
+
+#define SIZE   512
+#define READ   'r'
+#define WRITE  'w'
+int attempt(int fd, void *buf, int count, long long pos, int rw, int expect)
+{
+       /* Prepare a read or write iocb, run it through sync_submit(), log the
+        * result, and return 1 when it differs from expect, 0 when it matches. */
+       struct iocb io;
+       int got, mismatch;
+
+       if (rw == READ)
+               io_prep_pread(&io, fd, buf, count, pos);
+       else if (rw == WRITE)
+               io_prep_pwrite(&io, fd, buf, count, pos);
+
+       printf("expect %3d: (%c), res = ", expect, rw);
+       fflush(stdout);
+       got = sync_submit(&io);
+       mismatch = (got != expect);
+       printf("%3d [%s]%s\n", got, (got <= 0) ? strerror(-got) : "Success",
+               mismatch ? " -- FAILED" : "");
+       return mismatch;
+}
+
+int test_main(void)
+{
+       char buf[SIZE];
+       int rofd, wofd, rwfd;
+       int     status = 0, res;
+       /* Exercises attempt() on read-only, write-only and read-write descriptors. */
+       memset(buf, 0, SIZE);
+
+       rofd = open("testdir/rofile", O_RDONLY);        assert(rofd != -1);
+       wofd = open("testdir/wofile", O_WRONLY);        assert(wofd != -1);
+       rwfd = open("testdir/rwfile", O_RDWR);          assert(rwfd != -1);
+       /* Wrong-direction I/O expects -EBADF; a negative offset expects -EINVAL. */
+       status |= attempt(rofd, buf, SIZE,  0, WRITE, -EBADF);
+       status |= attempt(wofd, buf, SIZE,  0,  READ, -EBADF);
+       status |= attempt(rwfd, buf, SIZE,  0, WRITE, SIZE);
+       status |= attempt(rwfd, buf, SIZE,  0,  READ, SIZE);
+       status |= attempt(rwfd, buf, SIZE, -1,  READ, -EINVAL);
+       status |= attempt(rwfd, buf, SIZE, -1, WRITE, -EINVAL);
+       /* O_APPEND: both writes pass offset 0, yet the read-back expects "12345678". */
+       rwfd = open("testdir/rwfile", O_RDWR|O_APPEND); assert(rwfd != -1);
+       res = ftruncate(rwfd, 0);                       assert(res == 0);
+       status |= attempt(rwfd, buf,    SIZE, 0,  READ, 0);
+       status |= attempt(rwfd, "1234",    4, 0, WRITE, 4);
+       status |= attempt(rwfd, "5678",    4, 0, WRITE, 4);
+       memset(buf, 0, SIZE);
+       status |= attempt(rwfd,    buf, SIZE, 0,  READ, 8);
+       printf("read after append: [%s]\n", buf);
+       assert(memcmp(buf, "12345678", 8) == 0);
+
+       status |= attempt(rwfd, KERNEL_RW_POINTER, SIZE, 0,  READ, -EFAULT);
+       status |= attempt(rwfd, KERNEL_RW_POINTER, SIZE, 0, WRITE, -EFAULT);
+
+       /* Some architectures map the 0 page.  Ugh. */
+#if !defined(__ia64__)
+       status |= attempt(rwfd,              NULL, SIZE, 0, WRITE, -EFAULT);
+#endif
+
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/5.t b/tools/libaio/harness/cases/5.t
new file mode 100644 (file)
index 0000000..7669fd7
--- /dev/null
@@ -0,0 +1,47 @@
+/* 5.t
+- Write from a mmap() of the same file. (5.t)
+*/
+#include "aio_setup.h"
+#include <sys/mman.h>
+
+int test_main(void)
+{
+       int pg = getpagesize();
+#define SIZE   512
+       char *map;
+       int fd;
+       int status = 0, rc;
+
+       fd = open("testdir/rwfile", O_RDWR);            assert(fd != -1);
+       rc = ftruncate(fd, SIZE);                       assert(rc == 0);
+
+       /* Readable and writable shared mapping: both directions expect SIZE. */
+       map = mmap(NULL, pg, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+       assert(map != MAP_FAILED);
+       status |= attempt_rw(fd, map, SIZE,  0, WRITE, SIZE);
+       status |= attempt_rw(fd, map, SIZE,  0,  READ, SIZE);
+
+       /* Same protection on a fresh mapping, this time reading first. */
+       rc = munmap(map, pg);                           assert(rc == 0);
+       map = mmap(NULL, pg, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+       assert(map != MAP_FAILED);
+       status |= attempt_rw(fd, map, SIZE,  0,  READ, SIZE);
+       status |= attempt_rw(fd, map, SIZE,  0, WRITE, SIZE);
+
+       /* Read-only mapping: WRITE expects SIZE, READ expects -EFAULT. */
+       rc = munmap(map, pg);                           assert(rc == 0);
+       map = mmap(NULL, pg, PROT_READ, MAP_SHARED, fd, 0);
+       assert(map != MAP_FAILED);
+       status |= attempt_rw(fd, map, SIZE,  0, WRITE, SIZE);
+       status |= attempt_rw(fd, map, SIZE,  0,  READ, -EFAULT);
+
+       /* Write-only mapping: READ expects SIZE, WRITE expects -EFAULT. */
+       rc = munmap(map, pg);                           assert(rc == 0);
+       map = mmap(NULL, pg, PROT_WRITE, MAP_SHARED, fd, 0);
+       assert(map != MAP_FAILED);
+       status |= attempt_rw(fd, map, SIZE,  0,  READ, SIZE);
+       status |= attempt_rw(fd, map, SIZE,  0, WRITE, -EFAULT);
+
+       return status;
+}
+
diff --git a/tools/libaio/harness/cases/6.t b/tools/libaio/harness/cases/6.t
new file mode 100644 (file)
index 0000000..cea4b01
--- /dev/null
@@ -0,0 +1,57 @@
+/* 6.t
+- huge reads (pinned pages) (6.t)
+- huge writes (6.t)
+*/
+#include "aio_setup.h"
+#include <sys/mman.h>
+
+long getmemsize(void)
+{
+       FILE *f = fopen("/proc/meminfo", "r");
+       long size = 0;
+       int gotit = 0;
+       char str[256];
+       /* Returns the number that follows "MemTotal:" (presumably in kB). */
+       assert(f != NULL);
+       while (NULL != fgets(str, sizeof(str), f)) {
+               /* fgets() NUL-terminates; strncmp() stops at that NUL. */
+               if (0 == strncmp(str, "MemTotal:", 9)) {
+                       if (1 == sscanf(str + 9, "%ld", &size)) {
+                               gotit = 1;
+                               break;
+                       }
+               }
+       }
+       fclose(f);
+
+       assert(gotit != 0);
+       return size;
+}
+
+int test_main(void)
+{
+       char *buf;
+       int rwfd;
+       int failed = 0, rc;
+       long nbytes = getmemsize();
+
+       /* One huge write and read-back; the size is MemTotal capped at 768*1024. */
+       printf("size = %ld\n", nbytes);
+       assert(nbytes >= (16 * 1024));
+       nbytes = (nbytes > (768 * 1024)) ? (768 * 1024) : nbytes;
+       /* MemTotal is presumably in kB, so scale it to a byte count. */
+       nbytes *= 1024;
+
+       rwfd = open("testdir/rwfile", O_RDWR);          assert(rwfd != -1);
+       rc = ftruncate(rwfd, 0);                        assert(rc == 0);
+       buf = malloc(nbytes);                           assert(buf != NULL);
+
+       /* buf is not initialised: only the transfer lengths are checked. */
+       failed |= attempt_rw(rwfd, buf, nbytes,  0, WRITE, nbytes);
+       failed |= attempt_rw(rwfd, buf, nbytes,  0,  READ, nbytes);
+
+       /* The file is not truncated again; buf and rwfd are not released. */
+
+       return failed;
+}
+
diff --git a/tools/libaio/harness/cases/7.t b/tools/libaio/harness/cases/7.t
new file mode 100644 (file)
index 0000000..d2d6cbc
--- /dev/null
@@ -0,0 +1,27 @@
+/* 7.t
+- Write overlapping the file size rlimit boundary: should be a short
+  write. (7.t)
+- Write at the file size rlimit boundary: should give EFBIG.  (I think
+  the spec requires that you do NOT deliver SIGXFSZ in this case, where
+  you would do so for sync IO.) (7.t)
+- Special case: a write of zero bytes at or beyond the file size rlimit
+  boundary must return success. (7.t)
+*/
+
+#include <sys/resource.h>
+
+void SET_RLIMIT(long long limit)
+{
+       /* Same byte count for the soft and the hard RLIMIT_FSIZE value. */
+       struct rlimit fsize = { .rlim_cur = limit, .rlim_max = limit };
+       int rc;
+
+       assert(fsize.rlim_cur == limit);        /* value survived the store */
+       assert(fsize.rlim_max == limit);
+       rc = setrlimit(RLIMIT_FSIZE, &fsize);   assert(rc == 0);
+}
+
+#define LIMIT  8192
+#define FILENAME       "testdir/rwfile"
+
+#include "common-7-8.h"
diff --git a/tools/libaio/harness/cases/8.t b/tools/libaio/harness/cases/8.t
new file mode 100644 (file)
index 0000000..8a3d83e
--- /dev/null
@@ -0,0 +1,49 @@
+/* 8.t
+- Ditto for the above three tests at the offset maximum (largest
+  possible ext2/3 file size.) (8.t)
+ */
+#include <sys/vfs.h>
+
+#define EXT2_OLD_SUPER_MAGIC   0xEF51
+#define EXT2_SUPER_MAGIC       0xEF53
+
+long long get_fs_limit(int fd)
+{
+       struct statfs s;
+       int res;
+       long long lim = 0;
+
+       res = fstatfs(fd, &s);          assert(res == 0);
+
+       switch(s.f_type) {
+       case EXT2_OLD_SUPER_MAGIC:
+       case EXT2_SUPER_MAGIC:
+#if 0
+       {
+               long long tmp;
+               tmp = s.f_bsize / 4;
+               /* 12 direct + indirect block + dind + tind */
+               lim = 12 + tmp + tmp * tmp + tmp * tmp * tmp;
+               lim *= s.f_bsize;
+               printf("limit(%ld) = %Ld\n", (long)s.f_bsize, lim);
+       }
+#endif
+               switch(s.f_bsize) {
+               case 4096: lim = 2199023251456; break;
+               default:
+                       printf("unknown ext2 blocksize %ld\n", (long)s.f_bsize);
+                       exit(3);
+               }
+               break;
+       default:
+               printf("unknown filesystem 0x%08lx\n", (long)s.f_type);
+               exit(3);
+       }
+       return lim;
+}
+
+#define SET_RLIMIT(x)  do ; while (0)
+#define LIMIT          get_fs_limit(rwfd)
+#define FILENAME       "testdir.ext2/rwfile"
+
+#include "common-7-8.h"
diff --git a/tools/libaio/harness/cases/aio_setup.h b/tools/libaio/harness/cases/aio_setup.h
new file mode 100644 (file)
index 0000000..37c9618
--- /dev/null
@@ -0,0 +1,98 @@
+io_context_t   io_ctx;
+#define BAD_CTX        ((io_context_t)-1)
+
+void aio_setup(int n)
+{
+       /* Initialise the global io_ctx with queue size n; exit(3) on failure. */
+       int res = io_queue_init(n, &io_ctx);
+       if (res != 0) {
+               printf("io_queue_init(%d) returned %d (%s)\n", n, res, strerror(-res));
+               exit(3);
+       }
+}
+
+int attempt_io_submit(io_context_t ctx, long nr, struct iocb *ios[], int expect)
+{
+       const char *errtext, *verdict;
+       int got;
+
+       /* Print and flush the expectation before io_submit() is called. */
+       printf("expect %3d: io_submit(%10p, %3ld, %10p) = ", expect, ctx, nr, ios);
+       fflush(stdout);
+       got = io_submit(ctx, nr, ios);
+       errtext = (got > 0) ? "" : strerror(-got);
+       verdict = (got == expect) ? "" : " -- FAILED";
+       printf("%3d [%s]%s\n", got, errtext, verdict);
+       return got != expect;   /* 1 on mismatch, 0 when as expected */
+}
+
+int sync_submit(struct iocb *iocb)
+{
+       struct io_event event;
+       struct iocb *iocbs[] = { iocb };
+       int res;
+       /* Submit one iocb on io_ctx; return event.res or a negative error. */
+       /* Wait at most 30 seconds for the completion event. */
+       struct timespec ts;
+       ts.tv_sec = 30;
+       ts.tv_nsec = 0;
+
+       res = io_submit(io_ctx, 1, iocbs);
+       if (res != 1) {
+               printf("sync_submit: io_submit res=%d [%s]\n", res, strerror(-res));
+               return res;
+       }
+       /* min_nr must be 1: with 0, io_getevents() returns without waiting. */
+       res = io_getevents(io_ctx, 1, 1, &event, &ts);
+       if (res != 1) {
+               printf("sync_submit: io_getevents res=%d [%s]\n", res, strerror(-res));
+               return res ? res : -ETIMEDOUT;  /* 0 events means timed out */
+       }
+       return event.res;
+}
+
+#define SETUP  aio_setup(1024)
+
+
+#define READ           'r'
+#define WRITE          'w'
+#define READ_SILENT    'R'
+#define WRITE_SILENT   'W'
+int attempt_rw(int fd, void *buf, int count, long long pos, int rw, int expect)
+{
+       struct iocb iocb;
+       int res;
+       int silent = 0;
+       /* One aio pread/pwrite of count bytes at pos; 0 iff result == expect. */
+       /* The *_SILENT codes print only when the result is unexpected. */
+       switch(rw) {
+       case READ_SILENT:
+               silent = 1;     /* fall through */
+       case READ:
+               io_prep_pread (&iocb, fd, buf, count, pos);
+               break;
+       case WRITE_SILENT:
+               silent = 1;     /* fall through */
+       case WRITE:
+               io_prep_pwrite(&iocb, fd, buf, count, pos);
+               break;
+       default:        /* never submit an iocb that was not prepared */
+               printf("attempt_rw: bad rw code %d -- FAILED\n", rw);
+               return 1;
+       }
+
+       if (!silent) {
+               printf("expect %5d: (%c), res = ", expect, rw);
+               fflush(stdout);
+       }
+       res = sync_submit(&iocb);
+       if (!silent || res != expect) {
+               if (silent)
+                       printf("expect %5d: (%c), res = ", expect, rw);
+               printf("%5d [%s]%s\n", res,
+                       (res <= 0) ? strerror(-res) : "Success",
+                       (res != expect) ? " -- FAILED" : "");
+       }
+       return res != expect;   /* 1 = mismatch (failure), 0 = as expected */
+}
+
diff --git a/tools/libaio/harness/cases/common-7-8.h b/tools/libaio/harness/cases/common-7-8.h
new file mode 100644 (file)
index 0000000..3ec2bb4
--- /dev/null
@@ -0,0 +1,37 @@
+/* common-7-8.h
+*/
+#include "aio_setup.h"
+
+#include <unistd.h>
+
+#define SIZE   512
+
+int test_main(void)
+{
+       char *buf;
+       int rwfd;
+       int failed = 0, rc;
+       long long limit;
+
+       rwfd = open(FILENAME, O_RDWR);          assert(rwfd != -1);
+       rc = ftruncate(rwfd, 0);                assert(rc == 0);
+       buf = calloc(1, SIZE);                  assert(buf != NULL);
+
+       /* LIMIT may refer to rwfd, so it is evaluated only after the open. */
+       limit = LIMIT;
+       SET_RLIMIT(limit);
+
+       /* Last SIZE bytes below the limit: full-length transfers expected. */
+       failed |= attempt_rw(rwfd, buf, SIZE,   limit-SIZE, WRITE, SIZE);
+       failed |= attempt_rw(rwfd, buf, SIZE,   limit-SIZE,  READ, SIZE);
+       /* Overlapping the limit by one byte: transfers one byte short. */
+       failed |= attempt_rw(rwfd, buf, SIZE, 1+limit-SIZE, WRITE, SIZE-1);
+       failed |= attempt_rw(rwfd, buf, SIZE, 1+limit-SIZE,  READ, SIZE-1);
+       /* At the limit: write -> -EFBIG, read -> 0, zero-length write -> 0. */
+       failed |= attempt_rw(rwfd, buf, SIZE,        limit, WRITE, -EFBIG);
+       failed |= attempt_rw(rwfd, buf, SIZE,        limit,  READ,      0);
+       failed |= attempt_rw(rwfd, buf,    0,        limit, WRITE,      0);
+
+       return failed;
+}
+
diff --git a/tools/libaio/harness/main.c b/tools/libaio/harness/main.c
new file mode 100644 (file)
index 0000000..74b2764
--- /dev/null
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <libaio.h>
+
+#if defined(__i386__)
+#define KERNEL_RW_POINTER      ((void *)0xc0010000)
+#else
+//#warning Not really sure where kernel memory is.  Guessing.
+#define KERNEL_RW_POINTER      ((void *)0xffffffffc0010000)
+#endif
+
+
+char test_name[] = TEST_NAME;
+
+#include TEST_NAME
+
+int main(void)
+{
+       const char *outcome;
+       int failed;
+
+       /* SETUP is optional; the included test case may define it. */
+#ifdef SETUP
+       SETUP;
+#endif
+       failed = (test_main() != 0);
+       outcome = failed ? "FAILED" : "PASSED";
+       printf("test %s completed %s.\n", test_name, outcome);
+       fflush(stdout);
+       return failed;          /* exit status: 1 on failure, 0 on success */
+}
diff --git a/tools/libaio/harness/runtests.sh b/tools/libaio/harness/runtests.sh
new file mode 100644 (file)
index 0000000..d763d88
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+passes=0
+fails=0
+
+echo "Test run starting at" `date`
+
+while [ $# -ge 1 ] ; do
+       this_test=$1
+       shift
+       echo "Starting $this_test"
+       $this_test 2>&1
+       res=$?
+       if [ $res -eq 0 ] ; then str="" ; passes=$[passes + 1] ; else str=" -- FAILED" ; fails=$[fails + 1] ; fi
+       echo "Completed $this_test with $res$str".
+done
+
+echo "Pass: $passes  Fail: $fails"
+echo "Test run complete at" `date`
diff --git a/tools/libaio/libaio.spec b/tools/libaio/libaio.spec
new file mode 100644 (file)
index 0000000..1f16c91
--- /dev/null
@@ -0,0 +1,177 @@
+Name: libaio
+Version: 0.3.104
+Release: 1
+Summary: Linux-native asynchronous I/O access library
+Copyright: LGPL
+Group:  System Environment/Libraries
+Source: %{name}-%{version}.tar.gz
+BuildRoot: %{_tmppath}/%{name}-root
+# Fix ExclusiveArch as we implement this functionality on more architectures
+ExclusiveArch: i386 x86_64 ia64 s390 s390x ppc ppc64 ppc64pseries ppc64iseries alpha alphaev6
+
+%description
+The Linux-native asynchronous I/O facility ("async I/O", or "aio") has a
+richer API and capability set than the simple POSIX async I/O facility.
+This library, libaio, provides the Linux-native API for async I/O.
+The POSIX async I/O facility requires this library in order to provide
+kernel-accelerated async I/O capabilities, as do applications which
+require the Linux-native async I/O API.
+
+%package devel
+Summary: Development files for Linux-native asynchronous I/O access
+Group: Development/System
+Requires: libaio
+Provides: libaio.so.1
+
+%description devel
+This package provides header files to include and libraries to link with
+for the Linux-native asynchronous I/O facility ("async I/O", or "aio").
+
+%prep
+%setup
+
+%build
+make
+
+%install
+[ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT
+
+make install prefix=$RPM_BUILD_ROOT/usr \
+ libdir=$RPM_BUILD_ROOT/%{_libdir} \
+ root=$RPM_BUILD_ROOT
+
+%clean
+[ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%files
+%defattr(-,root,root)
+%attr(0755,root,root) %{_libdir}/libaio.so.*
+%doc COPYING TODO
+
+%files devel
+%defattr(-,root,root)
+%attr(0644,root,root) %{_includedir}/*
+%attr(0755,root,root) %{_libdir}/libaio.so
+%attr(0644,root,root) %{_libdir}/libaio.a
+
+%changelog
+* Fri Apr  1 2005 Jeff Moyer <jmoyer@redhat.com> - 0.3.104-1
+- Add Alpha architecture support.  (Sergey Tikhonov <tsv@solvo.ru>)
+
+* Tue Jan 25 2005 Jeff Moyer <jmoyer@redhat.com> - 0.3.103-1
+- Fix SONAME breakage.  In changing file names around, I also changed the 
+  SONAME, which is a no no.
+
+* Thu Oct 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.102-1
+- S390 asm had a bug; I forgot to update the clobber list.  Lucky for me,
+  newer compilers complain about such things.
+- Also update the s390 asm to look more like the new kernel variants.
+
+* Wed Oct 13 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.101-1
+- Revert syscall return values to be -ERRNO.  This was an inadvertent bug
+  introduced when clobber lists changed.
+- add ppc64pseries and ppc64iseries to exclusivearch
+
+* Tue Sep 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.100-1
+- Switch around the tests for _PPC_ and _powerpc64_ so that the ppc64 
+  platforms get the right padding.
+
+* Wed Jul 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-4
+- Ok, there was a race in moving the cvs module.  Someone rebuild from
+  the old cvs into fc3.  *sigh*  bumping rev.
+
+* Wed Jul 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-3
+- Actually provide libaio.so.1.
+
+* Tue Mar 30 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-2
+- Apparently the 0.3.93 patch was not meant for 0.3.96.  Backed it out.
+
+* Tue Mar 30 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-1
+- Fix compat calls.
+- make library .so.1.0.0 and make symlinks properly.
+- Fix header file for inclusion in c++ code.
+
+* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.98-2
+- bah.  fix version nr in changelog.
+
+* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.98-1
+- fix compiler warnings.
+
+* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.97-2
+- make srpm was using rpm to do a build.  changed that to use rpmbuild if
+  it exists, and fallback to rpm if it doesn't.
+
+* Tue Feb 24 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.97-1
+- Use libc syscall(2) instead of rolling our own calling mechanism.  This 
+  change is inspired due to a failure to build with newer gcc, since clobber 
+  lists were wrong.
+- Add -fpic to the CFLAGS for all architectures.  Should address bz #109457.
+- change a #include from <linux/types.h> to <sys/types.h>.  Fixes a build
+  issue on s390.
+
+* Wed Jul  7 2003 Bill Nottingham <notting@redhat.com> 0.3.96-3
+- fix paths on lib64 arches
+
+* Wed Jun 18 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.96-2
+- optimization in io_getevents from Arjan van de Ven in 0.3.96-1
+- deal with ia64 in 0.3.96-2
+
+* Wed May 28 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.95-1
+- ppc bugfix from Julie DeWandel
+
+* Tue May 20 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.94-1
+- symbol versioning fix from Ulrich Drepper
+
+* Mon Jan 27 2003 Benjamin LaHaise <bcrl@redhat.com>
+- bump to 0.3.93-3 for rebuild.
+
+* Mon Dec 16 2002 Benjamin LaHaise <bcrl@redhat.com>
+- libaio 0.3.93 test release
+- add powerpc support from Gianni Tedesco <gianni@ecsc.co.uk>
+- add s/390 support from Arnd Bergmann <arnd@bergmann-dalldorf.de>
+
+* Fri Sep 12 2002 Benjamin LaHaise <bcrl@redhat.com>
+- libaio 0.3.92 test release
+- build on x86-64
+
+* Thu Sep 12 2002 Benjamin LaHaise <bcrl@redhat.com>
+- libaio 0.3.91 test release
+- build on ia64
+- remove libredhat-kernel from the .spec file
+
+* Thu Sep  5 2002 Benjamin LaHaise <bcrl@redhat.com>
+- libaio 0.3.90 test release
+
+* Mon Apr 29 2002 Benjamin LaHaise <bcrl@redhat.com>
+- add requires initscripts >= 6.47-1 to get boot time libredhat-kernel 
+  linkage correct.
+- typo fix
+
+* Thu Apr 25 2002 Benjamin LaHaise <bcrl@redhat.com>
+- make /usr/lib/libredhat-kernel.so point to /lib/libredhat-kernel.so.1.0.0
+
+* Mon Apr 15 2002 Tim Powers <timp@redhat.com>
+- make the post scriptlet not use /bin/sh
+
+* Sat Apr 12 2002 Benjamin LaHaise <bcrl@redhat.com>
+- add /lib/libredhat-kernel* to %files.
+
+* Fri Apr 12 2002 Benjamin LaHaise <bcrl@redhat.com>
+- make the dummy install as /lib/libredhat-kernel.so.1.0.0 so 
+  that ldconfig will link against it if no other is installed.
+
+* Tue Jan 22 2002 Benjamin LaHaise <bcrl@redhat.com>
+- add io_getevents
+
+* Tue Jan 22 2002 Michael K. Johnson <johnsonm@redhat.com>
+- Make linker happy with /usr/lib symlink for libredhat-kernel.so
+
+* Mon Jan 21 2002 Michael K. Johnson <johnsonm@redhat.com>
+- Added stub library
+
+* Sun Jan 20 2002 Michael K. Johnson <johnsonm@redhat.com>
+- Initial packaging
diff --git a/tools/libaio/man/aio.3 b/tools/libaio/man/aio.3
new file mode 100644 (file)
index 0000000..6dc3c63
--- /dev/null
@@ -0,0 +1,315 @@
+.TH aio 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio \- Asynchronous IO
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.fi
+.SH DESCRIPTION
+The POSIX.1b standard defines a new set of I/O operations which can
+significantly reduce the time an application spends waiting at I/O.  The
+new functions allow a program to initiate one or more I/O operations and
+then immediately resume normal work while the I/O operations are
+executed in parallel.  This functionality is available if the
+.IR "unistd.h"
+file defines the symbol 
+.B "_POSIX_ASYNCHRONOUS_IO"
+.
+
+These functions are part of the library with realtime functions named
+.IR "librt" .
+They are not actually part of the
+.IR "libc" 
+binary.
+The implementation of these functions can be done using support in the
+kernel (if available) or using an implementation based on threads at
+userlevel.  In the latter case it might be necessary to link applications
+with the thread library 
+.IR "libpthread"
+in addition to 
+.IR "librt"
+and
+.IR "libaio"
+.
+
+All AIO operations operate on files which were opened previously.  There
+might be arbitrarily many operations running for one file.  The
+asynchronous I/O operations are controlled using a data structure named
+.IR "struct aiocb" .
+It is defined in
+.IR "aio.h"
+as follows.
+
+.nf
+struct aiocb
+{
+  int aio_fildes;               /* File descriptor.  */
+  int aio_lio_opcode;           /* Operation to be performed.  */
+  int aio_reqprio;              /* Request priority offset.  */
+  volatile void *aio_buf;       /* Location of buffer.  */
+  size_t aio_nbytes;            /* Length of transfer.  */
+  struct sigevent aio_sigevent; /* Signal number and value.  */
+
+  /* Internal members.  */
+  struct aiocb *__next_prio;
+  int __abs_prio;
+  int __policy;
+  int __error_code;
+  __ssize_t __return_value;
+
+#ifndef __USE_FILE_OFFSET64
+  __off_t aio_offset;           /* File offset.  */
+  char __pad[sizeof (__off64_t) - sizeof (__off_t)];
+#else
+  __off64_t aio_offset;         /* File offset.  */
+#endif
+  char __unused[32];
+};
+
+.fi
+The POSIX.1b standard mandates that the 
+.IR "struct aiocb" 
+structure
+contains at least the members described in the following table.  There
+might be more elements which are used by the implementation, but
+depending upon these elements is not portable and is highly deprecated.
+
+.TP
+.IR "int aio_fildes"
+This element specifies the file descriptor to be used for the
+operation.  It must be a legal descriptor, otherwise the operation will
+fail.
+
+The device on which the file is opened must allow the seek operation.
+I.e., it is not possible to use any of the AIO operations on devices
+like terminals where an 
+.IR "lseek"
+ call would lead to an error.
+.TP
+.IR "off_t aio_offset"
+This element specifies the offset in the file at which the operation (input
+or output) is performed.  Since the operations are carried out in arbitrary
+order and more than one operation for one file descriptor can be
+started, one cannot expect a current read/write position of the file
+descriptor.
+.TP
+.IR "volatile void *aio_buf"
+This is a pointer to the buffer with the data to be written or the place
+where the read data is stored.
+.TP
+.IR "size_t aio_nbytes"
+This element specifies the length of the buffer pointed to by 
+.IR "aio_buf"
+.
+.TP
+.IR "int aio_reqprio"
+If the platform has defined 
+.B "_POSIX_PRIORITIZED_IO"
+and
+.B "_POSIX_PRIORITY_SCHEDULING"
+, the AIO requests are
+processed based on the current scheduling priority.  The
+.IR "aio_reqprio"
+element can then be used to lower the priority of the
+AIO operation.
+.TP
+.IR "struct sigevent aio_sigevent"
+This element specifies how the calling process is notified once the
+operation terminates.  If the 
+.IR "sigev_notify"
+element is
+.B "SIGEV_NONE"
+, no notification is sent.  If it is 
+.B "SIGEV_SIGNAL"
+,
+the signal determined by 
+.IR "sigev_signo"
+is sent.  Otherwise,
+.IR "sigev_notify"
+must be 
+.BR "SIGEV_THREAD" .
+In this case, a thread
+is created which starts executing the function pointed to by
+.IR "sigev_notify_function"
+.
+.TP
+.IR "int aio_lio_opcode"
+This element is only used by the 
+.IR "lio_listio"
+ and
+.IR "lio_listio64"
+ functions.  Since these functions allow an
+arbitrary number of operations to start at once, and each operation can be
+input or output (or nothing), the information must be stored in the
+control block.  The possible values are:
+.TP
+.B "LIO_READ"
+Start a read operation.  Read from the file at position
+.IR "aio_offset"
+ and store the next 
+.IR "aio_nbytes"
+ bytes in the
+buffer pointed to by 
+.IR "aio_buf"
+.
+.TP
+.B "LIO_WRITE"
+Start a write operation.  Write 
+.IR "aio_nbytes" 
+bytes starting at
+.IR "aio_buf"
+into the file starting at position 
+.IR "aio_offset"
+.
+.TP
+.B "LIO_NOP"
+Do nothing for this control block.  This value is useful sometimes when
+an array of 
+.IR "struct aiocb"
+values contains holes, i.e., some of the
+values must not be handled although the whole array is presented to the
+.IR "lio_listio"
+function.
+
+When the sources are compiled using 
+.B "_FILE_OFFSET_BITS == 64"
+on a
+32 bit machine, this type is in fact 
+.IR "struct aiocb64"
+, since the LFS
+interface transparently replaces the 
+.IR "struct aiocb"
+definition.
+.PP
+For use with the AIO functions defined in the LFS, there is a similar type
+defined which replaces the types of the appropriate members with larger
+types but otherwise is equivalent to 
+.IR "struct aiocb" .
+Particularly,
+all member names are the same.
+
+.nf
+/* The same for the 64bit offsets.  Please note that the members aio_fildes
+   to __return_value have to be the same in aiocb and aiocb64.  */
+#ifdef __USE_LARGEFILE64
+struct aiocb64
+{
+  int aio_fildes;               /* File descriptor.  */
+  int aio_lio_opcode;           /* Operation to be performed.  */
+  int aio_reqprio;              /* Request priority offset.  */
+  volatile void *aio_buf;       /* Location of buffer.  */
+  size_t aio_nbytes;            /* Length of transfer.  */
+  struct sigevent aio_sigevent; /* Signal number and value.  */
+
+  /* Internal members.  */
+  struct aiocb *__next_prio;
+  int __abs_prio;
+  int __policy;
+  int __error_code;
+  __ssize_t __return_value;
+
+  __off64_t aio_offset;         /* File offset.  */
+  char __unused[32];
+};
+
+.fi
+.TP
+.IR "int aio_fildes"
+This element specifies the file descriptor which is used for the
+operation.  It must be a legal descriptor since otherwise the operation
+fails for obvious reasons.
+The device on which the file is opened must allow the seek operation.
+I.e., it is not possible to use any of the AIO operations on devices
+like terminals where an 
+.IR "lseek"
+ call would lead to an error.
+.TP
+.IR "off64_t aio_offset"
+This element specifies at which offset in the file the operation (input
+or output) is performed.  Since the operations are carried out in arbitrary
+order and more than one operation for one file descriptor can be
+started, one cannot expect a current read/write position of the file
+descriptor.
+.TP
+.IR "volatile void *aio_buf"
+This is a pointer to the buffer with the data to be written or the place
+where the read data is stored.
+.TP
+.IR "size_t aio_nbytes"
+This element specifies the length of the buffer pointed to by 
+.IR "aio_buf"
+.
+.TP
+.IR "int aio_reqprio"
+If for the platform 
+.B "_POSIX_PRIORITIZED_IO"
+and
+.B "_POSIX_PRIORITY_SCHEDULING"
+are defined the AIO requests are
+processed based on the current scheduling priority.  The
+.IR "aio_reqprio"
+element can then be used to lower the priority of the
+AIO operation.
+.TP
+.IR "struct sigevent aio_sigevent"
+This element specifies how the calling process is notified once the
+operation terminates.  If the 
+.IR "sigev_notify"
+element is
+.BR "SIGEV_NONE" ,
+no notification is sent.  If it is
+.B "SIGEV_SIGNAL"
+,
+the signal determined by 
+.IR "sigev_signo"
+is sent.  Otherwise,
+.IR "sigev_notify"
+must be
+.BR "SIGEV_THREAD" ,
+in which case a thread is created
+which starts executing the function pointed to by
+.IR "sigev_notify_function"
+.
+.TP
+.IR "int aio_lio_opcode"
+This element is only used by the 
+.IR "lio_listio"
+and
+.IR "lio_listio64"
+functions.  Since these functions allow an
+arbitrary number of operations to start at once, and since each operation can be
+input or output (or nothing), the information must be stored in the
+control block.  See the description of 
+.IR "struct aiocb"
+for a description
+of the possible values.
+.PP
+When the sources are compiled using 
+.B "_FILE_OFFSET_BITS == 64"
+on a
+32 bit machine, this type is available under the name 
+.IR "struct aiocb64"
+, since the LFS transparently replaces the old interface.
+.SH "RETURN VALUES"
+.SH ERRORS
+.SH "SEE ALSO"
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_cancel.3 b/tools/libaio/man/aio_cancel.3
new file mode 100644 (file)
index 0000000..502c83c
--- /dev/null
@@ -0,0 +1,137 @@
+.TH aio_cancel 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_cancel \- Cancel asynchronous I/O requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_cancel (int fildes, struct aiocb *aiocbp)"
+.fi
+.SH DESCRIPTION
+When one or more requests are asynchronously processed, it might be
+useful in some situations to cancel a selected operation, e.g., if it
+becomes obvious that the written data is no longer accurate and would
+have to be overwritten soon.  As an example, assume an application, which
+writes data in files in a situation where new incoming data would have
+to be written in a file which will be updated by an enqueued request.
+The POSIX AIO implementation provides such a function, but this function
+is not capable of forcing the cancellation of the request.  It is up to the
+implementation to decide whether it is possible to cancel the operation
+or not.  Therefore using this function is merely a hint.
+.B "The libaio implementation does not implement the cancel operation in the"
+.B "POSIX libraries".
+.PP
+The 
+.IR aio_cancel
+function can be used to cancel one or more
+outstanding requests.  If the 
+.IR aiocbp 
+parameter is 
+.IR NULL
+, the
+function tries to cancel all of the outstanding requests which would process
+the file descriptor 
+.IR fildes 
+(i.e., whose 
+.IR aio_fildes 
+member
+is 
+.IR fildes
+).  If 
+.I aiocbp
+is not
+.IR NULL ,
+.IR aio_cancel
+attempts to cancel the specific request pointed to by 
+.IR aiocbp.
+
+For requests which were successfully canceled, the normal notification
+about the termination of the request should take place.  I.e., depending
+on the 
+.IR "struct sigevent" 
+object which controls this, nothing
+happens, a signal is sent or a thread is started.  If the request cannot
+be canceled, it terminates the usual way after performing the operation.
+After a request is successfully canceled, a call to 
+.IR aio_error
+with
+a reference to this request as the parameter will return
+.B ECANCELED
+and a call to 
+.IR aio_return
+will return 
+.IR -1.
+If the request wasn't canceled and is still running the error status is
+still 
+.B EINPROGRESS.
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is in fact 
+.IR aio_cancel64
+since the LFS interface
+transparently replaces the normal implementation.
+
+.SH "RETURN VALUES"
+.TP
+.B AIO_CANCELED
+If there were
+requests which haven't terminated and which were successfully canceled.
+.TP
+.B AIO_NOTCANCELED
+If there are one or more requests left which couldn't be canceled.
+In this case
+.IR aio_error
+must be used to find out which of the, perhaps multiple, requests (if
+.IR aiocbp
+is 
+.IR NULL
+) weren't successfully canceled.  
+.TP
+.B AIO_ALLDONE
+If all
+requests already terminated at the time 
+.IR aio_cancel 
+is called the
+return value is
+.BR AIO_ALLDONE .
+.SH ERRORS
+If an error occurred during the execution of 
+.IR aio_cancel 
+the
+function returns 
+.IR -1
+and sets 
+.IR errno
+to one of the following
+values.
+.TP
+.B EBADF
+The file descriptor 
+.IR fildes
+is not valid.
+.TP
+.B ENOSYS
+.IR aio_cancel
+is not implemented.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_cancel64.3 b/tools/libaio/man/aio_cancel64.3
new file mode 100644 (file)
index 0000000..ede775b
--- /dev/null
@@ -0,0 +1,50 @@
+.TH aio_cancel64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_cancel64 \- Cancel asynchronous I/O requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_cancel64 (int fildes, struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to 
+.IR aio_cancel
+with the only difference
+that the argument is a reference to a variable of type 
+.IR "struct aiocb64" .
+.
+
+When the sources are compiled with
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is available under the name 
+.IR aio_cancel
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+See aio_cancel(3).
+.SH ERRORS
+See aio_cancel(3).
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_error.3 b/tools/libaio/man/aio_error.3
new file mode 100644 (file)
index 0000000..12b82cf
--- /dev/null
@@ -0,0 +1,81 @@
+.TH aio_error 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_error \- Getting the Status of AIO Operations
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_error (const struct aiocb *aiocbp)"
+.fi
+.SH DESCRIPTION
+The function
+.IR aio_error
+determines the error state of the request described by the
+.IR "struct aiocb"
+variable pointed to by 
+.I aiocbp
+. 
+
+When the operation is performed truly asynchronously (as with
+.IR "aio_read"
+and 
+.IR "aio_write"
+and with 
+.IR "lio_listio"
+when the mode is 
+.IR "LIO_NOWAIT"
+), one sometimes needs to know whether a
+specific request already terminated and if so, what the result was.
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this function is in fact 
+.IR "aio_error64"
+since the LFS interface transparently replaces the normal implementation.
+.SH "RETURN VALUES"
+If the request has not yet terminated the value returned is always
+.IR "EINPROGRESS" .
+Once the request has terminated the value
+.IR "aio_error"
+returns is either 
+.I 0
+if the request completed successfully or it returns the value which would be stored in the
+.IR "errno"
+variable if the request would have been done using
+.IR "read"
+, 
+.IR "write"
+, or 
+.IR "fsync"
+.
+.SH ERRORS
+.TP
+.IR "ENOSYS"
+if it is not implemented.  It
+could also return 
+.TP
+.IR "EINVAL"
+if the 
+.I aiocbp
+parameter does not
+refer to an asynchronous operation whose return status is not yet known.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_error64.3 b/tools/libaio/man/aio_error64.3
new file mode 100644 (file)
index 0000000..3333161
--- /dev/null
@@ -0,0 +1,64 @@
+.TH aio_error64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_error64 \- Return errors
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_error64 (const struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to 
+.IR aio_error
+with the only difference
+that the argument is a reference to a variable of type 
+.IR "struct aiocb64".
+.PP
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is available under the name 
+.IR aio_error
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+If the request has not yet terminated the value returned is always
+.IR "EINPROGRESS" .
+Once the request has terminated the value
+.IR "aio_error"
+returns is either 
+.I 0
+if the request completed successfully or it returns the value which would be stored in the
+.IR "errno"
+variable if the request would have been done using
+.IR "read" ,
+.IR "write" ,
+or
+.IR "fsync" .
+.SH ERRORS
+See 
+.IR aio_error(3).
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_fsync.3 b/tools/libaio/man/aio_fsync.3
new file mode 100644 (file)
index 0000000..637f0f6
--- /dev/null
@@ -0,0 +1,139 @@
+.TH aio_fsync 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_fsync \- Synchronize a file's complete in-core state with that on disk
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_fsync (int op, struct aiocb *aiocbp)"
+.fi
+.SH DESCRIPTION
+.PP
+When dealing with asynchronous operations it is sometimes necessary to
+get into a consistent state.  This would mean for AIO that one wants to
+know whether a certain request or a group of requests were processed.
+This could be done by waiting for the notification sent by the system
+after the operation terminated, but this sometimes would mean wasting
+resources (mainly computation time).  Instead POSIX.1b defines two
+functions which will help with most kinds of consistency.
+.PP
+The
+.IR aio_fsync
+and 
+.IR "aio_fsync64"
+functions are only available
+if the symbol 
+.IR "_POSIX_SYNCHRONIZED_IO"
+is defined in 
+.IR unistd.h .
+
+Calling this function forces all I/O operations queued at the
+time of the function call and operating on the file descriptor
+.IR "aiocbp->aio_fildes"
+into the synchronized I/O completion state.  The 
+.IR "aio_fsync"
+function returns
+immediately but the notification through the method described in
+.IR "aiocbp->aio_sigevent"
+will happen only after all requests for this
+file descriptor have terminated and the file is synchronized.  This also
+means that requests for this very same file descriptor which are queued
+after the synchronization request are not affected.
+
+If 
+.IR "op"
+is 
+.IR "O_DSYNC"
+the synchronization happens as with a call
+to 
+.IR "fdatasync" .
+Otherwise 
+.IR "op"
+should be 
+.IR "O_SYNC"
+and
+the synchronization happens as with 
+.IR "fsync" .
+
+As long as the synchronization has not happened, a call to
+.IR "aio_error"
+with the reference to the object pointed to by
+.IR "aiocbp"
+returns 
+.IR "EINPROGRESS" .
+Once the synchronization is
+done 
+.IR "aio_error"
+returns
+.IR 0
+if the synchronization was
+successful.  Otherwise the value returned is the value to which the
+.IR "fsync"
+or 
+.IR "fdatasync"
+function would have set the
+.IR "errno"
+variable.  In this case nothing can be assumed about the
+consistency for the data written to this file descriptor.
+
+.SH "RETURN VALUES"
+The return value of this function is 
+.IR 0
+if the request was
+successfully enqueued.  Otherwise the return value is 
+.IR -1
+and
+.IR "errno"
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The request could not be enqueued due to temporary lack of resources.
+.TP
+.B EBADF
+The file descriptor 
+.IR "aiocbp->aio_fildes"
+is not valid or not open
+for writing.
+.TP
+.B EINVAL
+The implementation does not support I/O synchronization or the 
+.IR "op"
+parameter is other than 
+.IR "O_DSYNC"
+and 
+.IR "O_SYNC" .
+.TP
+.B ENOSYS
+This function is not implemented.
+.PP
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+ this
+function is in fact 
+.IR "aio_fsync64"
+since the LFS interface
+transparently replaces the normal implementation.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_fsync64.3 b/tools/libaio/man/aio_fsync64.3
new file mode 100644 (file)
index 0000000..5dce22d
--- /dev/null
@@ -0,0 +1,51 @@
+.TH aio_fsync64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_fsync64 \- Synchronize a file's complete in-core state with that on disk
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_fsync64 (int op, struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to 
+.IR aio_fsync
+with the only difference
+that the argument is a reference to a variable of type 
+.IR "struct aiocb64".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is available under the name 
+.IR aio_fsync
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+See 
+.IR aio_fsync.
+.SH ERRORS
+See 
+.IR aio_fsync.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_init.3 b/tools/libaio/man/aio_init.3
new file mode 100644 (file)
index 0000000..3b0ec95
--- /dev/null
@@ -0,0 +1,96 @@
+.TH aio_init 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_init \- How to optimize the AIO implementation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "void aio_init (const struct aioinit *init)"
+.fi
+.SH DESCRIPTION
+
+The POSIX standard does not specify how the AIO functions are
+implemented.  They could be system calls, but it is also possible to
+emulate them at userlevel.
+
+At the point of this writing, the available implementation is a userlevel
+implementation which uses threads for handling the enqueued requests.
+While this implementation requires making some decisions about
+limitations, hard limitations are something which is best avoided
+in the GNU C library.  Therefore, the GNU C library provides a means
+for tuning the AIO implementation according to the individual use.
+
+.BI "struct aioinit"
+.PP
+This data type is used to pass the configuration or tunable parameters
+to the implementation.  The program has to initialize the members of
+this struct and pass it to the implementation using the 
+.IR aio_init
+function.
+.TP
+.B "int aio_threads"
+This member specifies the maximal number of threads which may be used
+at any one time.
+.TP
+.B "int aio_num"
+This number provides an estimate on the maximal number of simultaneously
+enqueued requests.
+.TP
+.B "int aio_locks"
+Unused.
+.TP
+.B "int aio_usedba"
+Unused.
+.TP
+.B "int aio_debug"
+Unused.
+.TP
+.B "int aio_numusers"
+Unused.
+.TP
+.B "int aio_reserved[2]"
+Unused.
+.PP
+This function must be called before any other AIO function.  Calling it
+is completely voluntary, as it is only meant to help the AIO
+implementation perform better.
+
+Before calling the 
+.IR aio_init
+function, the members of a variable of
+type 
+.IR "struct aioinit"
+must be initialized.  Then a reference to
+this variable is passed as the parameter to 
+.IR aio_init
+which itself
+may or may not pay attention to the hints.
+
+This function is an extension which follows a proposal from the SGI implementation in
+.IR "Irix 6" .
+It is not covered by POSIX.1b or Unix98.
+.SH "RETURN VALUES"
+The function has no return value.
+.SH ERRORS
+The function has no error cases defined.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_read.3 b/tools/libaio/man/aio_read.3
new file mode 100644 (file)
index 0000000..5bcb6c8
--- /dev/null
@@ -0,0 +1,146 @@
+.TH aio_read 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_read \- Initiate an asynchronous read operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_read (struct aiocb *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function initiates an asynchronous read operation.  It
+immediately returns after the operation was enqueued or when an
+error was encountered.
+
+The first 
+.IR "aiocbp->aio_nbytes"
+bytes of the file for which
+.IR "aiocbp->aio_fildes"
+is a descriptor are written to the buffer
+starting at 
+.IR "aiocbp->aio_buf" .
+Reading starts at the absolute
+position 
+.IR "aiocbp->aio_offset"
+in the file.
+
+If prioritized I/O is supported by the platform the
+.IR "aiocbp->aio_reqprio"
+value is used to adjust the priority before
+the request is actually enqueued.
+
+The calling process is notified about the termination of the read
+request according to the 
+.IR "aiocbp->aio_sigevent"
+value.
+
+.SH "RETURN VALUES"
+When 
+.IR "aio_read"
+returns, the return value is zero if no error
+occurred that can be found before the process is enqueued.  If such an
+early error is found, the function returns 
+.IR -1
+and sets
+.IR "errno".
+
+.PP
+If 
+.IR "aio_read"
+returns zero, the current status of the request
+can be queried using 
+.IR "aio_error"
+and 
+.IR "aio_return"
+functions.
+As long as the value returned by 
+.IR "aio_error"
+is 
+.IR "EINPROGRESS"
+the operation has not yet completed.  If 
+.IR "aio_error"
+returns zero,
+the operation successfully terminated, otherwise the value is to be
+interpreted as an error code.  If the function terminated, the result of
+the operation can be obtained using a call to 
+.IR "aio_return" .
+The
+returned value is the same as an equivalent call to 
+.IR "read"
+would
+have returned.  
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is in fact 
+.IR "aio_read64"
+since the LFS interface transparently
+replaces the normal implementation.
+
+.SH ERRORS
+In the case of an early error:
+.TP
+.B  EAGAIN
+The request was not enqueued due to (temporarily) exceeded resource
+limitations.
+.TP
+.B  ENOSYS
+The 
+.IR "aio_read"
+function is not implemented.
+.TP
+.B  EBADF
+The 
+.IR "aiocbp->aio_fildes"
+descriptor is not valid.  This condition
+need not be recognized before enqueueing the request and so this error
+might also be signaled asynchronously.
+.TP
+.B  EINVAL
+The 
+.IR "aiocbp->aio_offset"
+or 
+.IR "aiocbp->aio_reqprio"
+value is
+invalid.  This condition need not be recognized before enqueueing the
+request and so this error might also be signaled asynchronously.
+
+.PP
+In the case of a normal return, possible error codes returned by 
+.IR "aio_error"
+are:
+.TP
+.B  EBADF
+The 
+.IR "aiocbp->aio_fildes"
+descriptor is not valid.
+.TP
+.B  ECANCELED
+The operation was canceled before the operation was finished.
+.TP
+.B  EINVAL
+The 
+.IR "aiocbp->aio_offset"
+value is invalid.
+.PP
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_read64.3 b/tools/libaio/man/aio_read64.3
new file mode 100644 (file)
index 0000000..8e407a5
--- /dev/null
@@ -0,0 +1,60 @@
+.TH aio_read64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_read64 \- Initiate an asynchronous read operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_read64 (struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to the 
+.IR "aio_read"
+function.  The only
+difference is that on 
+.IR "32 bit"
+machines, the file descriptor should
+be opened in the large file mode.  Internally, 
+.IR "aio_read64"
+uses
+functionality equivalent to 
+.IR "lseek64"
+to position the file descriptor correctly for the reading,
+as opposed to 
+.IR "lseek"
+functionality used in 
+.IR "aio_read".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is available under the name 
+.IR "aio_read"
+and so transparently
+replaces the interface for small files on 32 bit machines.
+.SH "RETURN VALUES"
+See
+.IR aio_read.
+.SH ERRORS
+See
+.IR aio_read.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_return.3 b/tools/libaio/man/aio_return.3
new file mode 100644 (file)
index 0000000..1e3335f
--- /dev/null
@@ -0,0 +1,71 @@
+.TH aio_return 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_return \- Retrieve status of asynchronous I/O operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "ssize_t aio_return (const struct aiocb *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function can be used to retrieve the return status of the operation
+carried out by the request described in the variable pointed to by
+.IR aiocbp .
+As long as the error status of this request as returned
+by 
+.IR aio_error
+is 
+.IR EINPROGRESS
+the return of this function is
+undefined.
+
+Once the request is finished this function can be used exactly once to
+retrieve the return value.  Following calls might lead to undefined
+behavior.  
+When the sources are compiled with 
+.B "_FILE_OFFSET_BITS == 64"
+this function is in fact 
+.IR aio_return64
+since the LFS interface
+transparently replaces the normal implementation.
+.SH "RETURN VALUES"
+The return value itself is the value which would have been
+returned by the 
+.IR read
+,
+.IR write
+, or 
+.IR fsync
+call.
+.SH ERRORS
+The function can return 
+.TP
+.B ENOSYS
+if it is not implemented.
+.TP
+.B EINVAL 
+if the 
+.IR aiocbp 
+parameter does not
+refer to an asynchronous operation whose return status is not yet known.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_return64.3 b/tools/libaio/man/aio_return64.3
new file mode 100644 (file)
index 0000000..7e78362
--- /dev/null
@@ -0,0 +1,51 @@
+.TH aio_return64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_return64 \- Retrieve status of asynchronous I/O operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "ssize_t aio_return64 (const struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to 
+.IR "aio_return"
+with the only difference
+that the argument is a reference to a variable of type 
+.IR "struct aiocb64".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is available under the name 
+.IR "aio_return"
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+See 
+.IR aio_return.
+.SH ERRORS
+See
+.IR aio_return.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_suspend.3 b/tools/libaio/man/aio_suspend.3
new file mode 100644 (file)
index 0000000..cae1b65
--- /dev/null
@@ -0,0 +1,123 @@
+.TH aio_suspend 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_suspend \- Wait until one or more requests of a specific set terminates
+.SH SYNOPSIS
+.nf
+.B "#include <errno.h>"
+.sp
+.br 
+.B "#include <aio.h>"
+.sp
+.br
+.BI "int aio_suspend (const struct aiocb *const list[], int nent, const struct timespec *timeout)"
+.fi
+.SH DESCRIPTION
+Another method of synchronization is to wait until one or more requests of a
+specific set terminated.  This could be achieved by the 
+.IR "aio_*"
+functions to notify the initiating process about the termination but in
+some situations this is not the ideal solution.  In a program which
+constantly updates clients somehow connected to the server it is not
+always the best solution to go round robin since some connections might
+be slow.  On the other hand letting the 
+.IR "aio_*"
+function notify the
+caller might also be not the best solution since whenever the process
+works on preparing data for a client it makes no sense to be
+interrupted by a notification since the new client will not be handled
+before the current client is served.  For situations like this
+.IR "aio_suspend"
+should be used.
+.PP
+When calling this function, the calling thread is suspended until at
+least one of the requests pointed to by the 
+.IR "nent"
+elements of the
+array 
+.IR "list"
+has completed.  If any of the requests has already
+completed at the time 
+.IR "aio_suspend"
+is called, the function returns
+immediately.  Whether a request has terminated or not is determined by
+comparing the error status of the request with 
+.IR "EINPROGRESS" .
+If
+an element of 
+.IR "list"
+is 
+.IR "NULL"
+, the entry is simply ignored.
+
+If no request has finished, the calling process is suspended.  If
+.IR "timeout"
+is 
+.IR "NULL"
+, the process is not woken until a request
+has finished.  If 
+.IR "timeout"
+is not 
+.IR "NULL"
+, the process remains
+suspended no longer than the time specified in
+.IR "timeout" .
+If the timeout expires before any request has finished,
+.IR "aio_suspend"
+returns with an error.
+.PP
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is in fact 
+.IR "aio_suspend64"
+since the LFS interface
+transparently replaces the normal implementation.
+.SH "RETURN VALUES"
+The return value of the function is 
+.IR 0
+if one or more requests
+from the 
+.IR "list"
+have terminated.  Otherwise the function returns
+.IR -1
+and 
+.IR "errno"
+is set.
+.SH ERRORS
+.TP
+.B EAGAIN
+None of the requests from the 
+.IR "list"
+completed in the time specified
+by 
+.IR "timeout" .
+.TP
+.B EINTR
+A signal interrupted the 
+.IR "aio_suspend"
+function.  This signal might
+also be sent by the AIO implementation while signalling the termination
+of one of the requests.
+.TP
+.B ENOSYS
+The 
+.IR "aio_suspend"
+function is not implemented.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_suspend64.3 b/tools/libaio/man/aio_suspend64.3
new file mode 100644 (file)
index 0000000..2f289ec
--- /dev/null
@@ -0,0 +1,51 @@
+.TH aio_suspend64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_suspend64 \- Wait until one or more requests of a specific set terminates
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_suspend64 (const struct aiocb64 *const list[], int nent, const struct timespec *timeout)"
+.fi
+.SH DESCRIPTION
+This function is similar to 
+.IR "aio_suspend"
+with the only difference
+that the argument is a reference to a variable of type 
+.IR "struct aiocb64".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+this
+function is available under the name 
+.IR "aio_suspend"
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+See
+.IR aio_suspend.
+.SH ERRORS
+See
+.IR aio_suspend.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_write(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_write.3 b/tools/libaio/man/aio_write.3
new file mode 100644 (file)
index 0000000..7c0cfd0
--- /dev/null
@@ -0,0 +1,176 @@
+.TH aio_write 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_write \- Initiate an asynchronous write operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_write (struct aiocb * aiocbp);"
+.fi
+.SH DESCRIPTION
+This function initiates an asynchronous write operation.  The function
+call immediately returns after the operation was enqueued or if before
+this happens an error was encountered.
+
+The first 
+.IR "aiocbp->aio_nbytes"
+bytes from the buffer starting at
+.IR "aiocbp->aio_buf"
+are written to the file for which
+.IR "aiocbp->aio_fildes"
+is a descriptor, starting at the absolute
+position 
+.IR "aiocbp->aio_offset"
+in the file.
+
+If prioritized I/O is supported by the platform, the
+.IR "aiocbp->aio_reqprio "
+value is used to adjust the priority before
+the request is actually enqueued.
+
+The calling process is notified about the termination of the write
+request according to the 
+.IR "aiocbp->aio_sigevent"
+value.
+
+When 
+.IR "aio_write"
+returns, the return value is zero if no error
+occurred that can be found before the process is enqueued.  If such an
+early error is found the function returns 
+.IR -1
+and sets
+.IR "errno"
+to one of the following values.
+
+.TP
+.B EAGAIN
+The request was not enqueued due to (temporarily) exceeded resource
+limitations.
+.TP
+.B ENOSYS
+The 
+.IR "aio_write"
+function is not implemented.
+.TP
+.B EBADF
+The 
+.IR "aiocbp->aio_fildes"
+descriptor is not valid.  This condition
+may not be recognized before enqueueing the request, and so this error
+might also be signaled asynchronously.
+.TP
+.B EINVAL
+The 
+.IR "aiocbp->aio_offset"
+or
+.IR "aiocbp->aio_reqprio"
+value is
+invalid.  This condition may not be recognized before enqueueing the
+request and so this error might also be signaled asynchronously.
+.PP
+
+In the case 
+.IR "aio_write"
+returns zero, the current status of the
+request can be queried using 
+.IR "aio_error"
+and 
+.IR "aio_return"
+functions.  As long as the value returned by 
+.IR "aio_error"
+is
+.IR "EINPROGRESS"
+the operation has not yet completed.  If
+.IR "aio_error"
+returns zero, the operation successfully terminated,
+otherwise the value is to be interpreted as an error code.  If the
+function terminated, the result of the operation can be obtained using a call
+to 
+.IR "aio_return" .
+The returned value is the same as an equivalent
+call to 
+.IR "write"
+would have returned.  Possible error codes returned
+by 
+.IR "aio_error"
+are:
+
+.TP
+.B EBADF
+The 
+.IR "aiocbp->aio_fildes"
+descriptor is not valid.
+.TP
+.B ECANCELED
+The operation was canceled before the operation was finished.
+.TP
+.B EINVAL
+The 
+.IR "aiocbp->aio_offset"
+value is invalid.
+.PP
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is in fact 
+.IR "aio_write64"
+since the LFS interface transparently
+replaces the normal implementation.
+.SH "RETURN VALUES"
+When 
+.IR "aio_write"
+returns, the return value is zero if no error
+occurred that can be found before the process is enqueued.  If such an
+early error is found the function returns 
+.IR -1
+and sets
+.IR "errno"
+to one of the following values.
+.SH ERRORS
+.TP
+.B EAGAIN
+The request was not enqueued due to (temporarily) exceeded resource
+limitations.
+.TP
+.B ENOSYS
+The 
+.IR "aio_write"
+function is not implemented.
+.TP
+.B EBADF
+The 
+.IR "aiocbp->aio_fildes"
+descriptor is not valid.  This condition
+may not be recognized before enqueueing the request, and so this error
+might also be signaled asynchronously.
+.TP
+.B EINVAL
+The 
+.IR "aiocbp->aio_offset"
+or
+.IR "aiocbp->aio_reqprio"
+value is
+invalid.  This condition may not be recognized before enqueueing the
+request and so this error might also be signaled asynchronously.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write64(3),
+.BR errno(3),
diff --git a/tools/libaio/man/aio_write64.3 b/tools/libaio/man/aio_write64.3
new file mode 100644 (file)
index 0000000..1080903
--- /dev/null
@@ -0,0 +1,61 @@
+.TH aio_write64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+aio_write64 \- Initiate an asynchronous write operation
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <aio.h>
+.sp
+.br
+.BI "int aio_write64 (struct aiocb64 *aiocbp)"
+.fi
+.SH DESCRIPTION
+This function is similar to the 
+.IR "aio_write"
+function.  The only
+difference is that on 
+.IR "32 bit"
+machines the file descriptor should
+be opened in the large file mode.  Internally 
+.IR "aio_write64"
+uses
+functionality equivalent to 
+.IR "lseek64"
+to position the file descriptor correctly for the writing,
+as opposed to 
+.IR "lseek"
+functionality used in 
+.IR "aio_write".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is available under the name 
+.IR "aio_write"
+and so transparently
+replaces the interface for small files on 32 bit machines.
+.SH "RETURN VALUES"
+See
+.IR aio_write.
+.SH ERRORS
+See
+.IR aio_write.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR errno(3),
diff --git a/tools/libaio/man/io.3 b/tools/libaio/man/io.3
new file mode 100644 (file)
index 0000000..d910a68
--- /dev/null
@@ -0,0 +1,351 @@
+.TH io 3 2002-09-12 "Linux 2.4" "Linux IO"
+.SH NAME
+io \- Asynchronous IO
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br 
+.B #include <libaio.h>
+.sp
+.fi
+.SH DESCRIPTION
+The libaio library defines a new set of I/O operations which can
+significantly reduce the time an application spends waiting at I/O.  The
+new functions allow a program to initiate one or more I/O operations and
+then immediately resume normal work while the I/O operations are
+executed in parallel.  
+
+These functions are part of the library with realtime functions named
+.IR "libaio" .
+They are not actually part of the 
+.IR "libc" 
+binary.
+The implementation of these functions can be done using support in the
+kernel.
+
+All IO operations operate on files which were opened previously.  There
+might be arbitrarily many operations running for one file.  The
+asynchronous I/O operations are controlled using a data structure named
+.IR "struct iocb" .
+It is defined in
+.IR "libaio.h"
+as follows.
+
+.nf
+
+typedef struct io_context *io_context_t;
+
+typedef enum io_iocb_cmd {
+        IO_CMD_PREAD = 0,
+        IO_CMD_PWRITE = 1,
+
+        IO_CMD_FSYNC = 2,
+        IO_CMD_FDSYNC = 3,
+
+        IO_CMD_POLL = 5,
+        IO_CMD_NOOP = 6,
+} io_iocb_cmd_t;
+
+struct io_iocb_common {
+        void            *buf;
+        unsigned        __pad1;
+        long            nbytes;
+        unsigned        __pad2;
+        long long       offset;
+        long long       __pad3, __pad4;
+};      /* result code is the amount read or -'ve errno */
+
+
+struct iocb {
+        void            *data;
+        unsigned        key;
+        short           aio_lio_opcode;
+        short           aio_reqprio;
+        int             aio_fildes;
+        union {
+                struct io_iocb_common           c;
+                struct io_iocb_vector           v;
+                struct io_iocb_poll             poll;
+                struct io_iocb_sockaddr saddr;
+        } u;
+}; 
+
+
+.fi
+.TP
+.IR "int aio_fildes"
+This element specifies the file descriptor to be used for the
+operation.  It must be a legal descriptor, otherwise the operation will
+fail.
+
+The device on which the file is opened must allow the seek operation.
+I.e., it is not possible to use any of the IO operations on devices
+like terminals where an 
+.IR "lseek"
+call would lead to an error.
+.TP
+.IR "long long u.c.offset"
+This element specifies the offset in the file at which the operation (input
+or output) is performed.  Since the operations are carried out in arbitrary
+order and more than one operation for one file descriptor can be
+started, one cannot expect a current read/write position of the file
+descriptor.
+.TP
+.IR "void *u.c.buf"
+This is a pointer to the buffer with the data to be written or the place
+where the read data is stored.
+.TP
+.IR "long u.c.nbytes"
+This element specifies the length of the buffer pointed to by 
+.IR "u.c.buf" .
+.TP
+.IR "short aio_reqprio"
+Is not currently used.
+.TP
+.B "IO_CMD_PREAD"
+Start a read operation.  Read from the file at position
+.IR "u.c.offset"
+and store the next 
+.IR "u.c.nbytes"
+bytes in the
+buffer pointed to by 
+.IR "buf" .
+.TP
+.B "IO_CMD_PWRITE"
+Start a write operation.  Write 
+.IR "u.c.nbytes" 
+bytes starting at
+.IR "buf"
+into the file starting at position 
+.IR "u.c.offset" .
+.TP
+.B "IO_CMD_NOOP"
+Do nothing for this control block.  This value is useful sometimes when
+an array of 
+.IR "struct iocb"
+values contains holes, i.e., some of the
+values must not be handled although the whole array is presented to the
+.IR "io_submit"
+function.
+.TP 
+.B "IO_CMD_FSYNC"
+.TP
+.B "IO_CMD_POLL"
+This is experimental.
+.SH EXAMPLE
+.nf
+/*
+ * Simplistic version of copy command using async i/o
+ *
+ * From:       Stephen Hemminger <shemminger@osdl.org>
+ * Copy file by using a async I/O state machine.
+ * 1. Start read request
+ * 2. When read completes turn it into a write request
+ * 3. When write completes decrement counter and free resources
+ *
+ *
+ * Usage: aiocp file(s) destination
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <libaio.h>
+
+#define AIO_BLKSIZE    (64*1024)
+#define AIO_MAXIO      32
+
+static int busy = 0;           // # of I/O's in flight
+static int tocopy = 0;         // # of blocks left to copy
+static int dstfd = -1;         // destination file descriptor
+static const char *dstname = NULL;
+static const char *srcname = NULL;
+
+
+/*
+ * Fatal error handler.
+ *
+ * func: name of the failing operation, used as the message prefix.
+ * rc:   status code; negative values are treated as negated errno
+ *       numbers (e.g. -ENOSYS).
+ *
+ * Prints a diagnostic to stderr, closes the destination descriptor if
+ * one is open, unlinks the destination file if its name is known, and
+ * terminates the program with exit status 1.
+ */
+static void io_error(const char *func, int rc)
+{
+    if (rc == -ENOSYS)
+       fprintf(stderr, "AIO not in this kernel\n");
+    else if (rc < 0)
+       /* strerror() replaces the obsolete sys_nerr/sys_errlist tables */
+       fprintf(stderr, "%s: %s\n", func, strerror(-rc));
+    else
+       fprintf(stderr, "%s: error %d\n", func, rc);
+
+    /* dstfd is initialised to -1; descriptor 0 is a valid open file */
+    if (dstfd >= 0)
+       close(dstfd);
+    if (dstname)
+       unlink(dstname);
+    exit(1);
+}
+
+/*
+ * Write complete callback.
+ * Adjust counts and free resources.
+ *
+ * A non-zero res2 is reported through io_error(); a res that differs from
+ * the requested byte count (iocb->u.c.nbytes) is reported as a short
+ * write.  Both cases terminate the program.  On success the data buffer
+ * and the iocb itself are freed.
+ */
+static void wr_done(io_context_t ctx, struct iocb *iocb, long res, long res2)
+{
+    if (res2 != 0) {
+        io_error("aio write", res2);
+    }
+    if (res != iocb->u.c.nbytes) {
+        /* Print res (the byte count); res2 is known to be 0 at this point. */
+        fprintf(stderr, "write missed bytes expect %ld got %ld\n",
+                (long) iocb->u.c.nbytes, res);
+        exit(1);
+    }
+    --tocopy;
+    --busy;
+    free(iocb->u.c.buf);
+
+    /* Poison the whole control block, not just sizeof(pointer) bytes. */
+    memset(iocb, 0xff, sizeof(*iocb));  // paranoia
+    free(iocb);
+    write(2, "w", 1);
+}
+
+/*
+ * Read complete callback.
+ * Change read iocb into a write iocb and start it.
+ *
+ * A non-zero res2 is reported through io_error(); a res that differs from
+ * the requested size is reported as a short read and ends the program.
+ * The same iocb and buffer are reused for the write to dstfd at the same
+ * offset, with wr_done() installed as the completion callback.
+ */
+static void rd_done(io_context_t ctx, struct iocb *iocb, long res, long res2)
+{
+    /* library needs accessors to look at iocb? */
+    int iosize = iocb->u.c.nbytes;
+    char *buf = iocb->u.c.buf;
+    off_t offset = iocb->u.c.offset;
+
+    if (res2 != 0)
+        io_error("aio read", res2);
+    if (res != iosize) {
+        /* res is a long, so it needs %ld; iosize is the int copy of nbytes. */
+        fprintf(stderr, "read missing bytes expect %d got %ld\n", iosize, res);
+        exit(1);
+    }
+
+    /* turn read into write */
+    io_prep_pwrite(iocb, dstfd, buf, iosize, offset);
+    io_set_callback(iocb, wr_done);
+    if (1 != (res = io_submit(ctx, 1, &iocb)))
+        io_error("io_submit write", res);
+    write(2, "r", 1);
+}
+
+
+/*
+ * Copy the file named by argv[1] to the file named by argv[2].
+ *
+ * Reads are queued in batches of at most AIO_MAXIO / 2 while keeping no
+ * more than AIO_MAXIO requests in flight.  rd_done() turns each finished
+ * read into a write and wr_done() retires it; the loop ends once tocopy
+ * drops to zero.
+ */
+int main(int argc, char *const *argv)
+{
+    int srcfd;
+    int rc;
+    struct stat st;
+    off_t length = 0, offset = 0;
+    io_context_t myctx;
+
+    if (argc != 3 || argv[1][0] == '-') {
+        fprintf(stderr, "Usage: aiocp SOURCE DEST\n");
+        exit(1);
+    }
+    if ((srcfd = open(srcname = argv[1], O_RDONLY)) < 0) {
+        perror(srcname);
+        exit(1);
+    }
+    if (fstat(srcfd, &st) < 0) {
+        perror("fstat");
+        exit(1);
+    }
+    length = st.st_size;
+
+    /* O_TRUNC keeps old data of a longer destination from surviving the copy. */
+    if ((dstfd = open(dstname = argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+        /* Report first: close() could overwrite errno. */
+        perror(dstname);
+        close(srcfd);
+        exit(1);
+    }
+
+    /* initialize state machine */
+    memset(&myctx, 0, sizeof(myctx));
+    rc = io_queue_init(AIO_MAXIO, &myctx);
+    if (rc < 0)
+        io_error("io_queue_init", rc);
+    tocopy = howmany(length, AIO_BLKSIZE);
+
+    while (tocopy > 0) {
+        int i;
+        /* Submit as many reads at once as possible, up to AIO_MAXIO */
+        int n = MIN(MIN(AIO_MAXIO - busy, AIO_MAXIO / 2),
+                    howmany(length - offset, AIO_BLKSIZE));
+        if (n > 0) {
+            struct iocb *ioq[n];
+            int sent = 0;
+
+            for (i = 0; i < n; i++) {
+                struct iocb *io = (struct iocb *) malloc(sizeof(struct iocb));
+                int iosize = MIN(length - offset, AIO_BLKSIZE);
+                char *buf = (char *) malloc(iosize);
+
+                if (NULL == buf || NULL == io) {
+                    fprintf(stderr, "out of memory\n");
+                    exit(1);
+                }
+
+                io_prep_pread(io, srcfd, buf, iosize, offset);
+                io_set_callback(io, rd_done);
+                ioq[i] = io;
+                offset += iosize;
+            }
+
+            /*
+             * io_submit() may queue fewer requests than asked for; keep
+             * submitting the remainder so that no prepared iocb is lost.
+             */
+            while (sent < n) {
+                rc = io_submit(myctx, n - sent, ioq + sent);
+                if (rc <= 0)
+                    io_error("io_submit", rc);
+                sent += rc;
+            }
+
+            busy += n;
+        }
+
+        // Handle IO's that have completed
+        rc = io_queue_run(myctx);
+        if (rc < 0)
+            io_error("io_queue_run", rc);
+
+        // if we have maximum number of i/o's in flight
+        // then wait for one to complete
+        if (busy == AIO_MAXIO) {
+            rc = io_queue_wait(myctx, NULL);
+            if (rc < 0)
+                io_error("io_queue_wait", rc);
+        }
+
+    }
+
+    close(srcfd);
+    close(dstfd);
+    exit(0);
+}
+
+/* 
+ * Results look like:
+ * [alanm@toolbox ~/MOT3]$ ../taio kernel-source-2.4.8-0.4g.ppc.rpm abc
+ * rrrrrrrrrrrrrrrwwwrwrrwwrrwrwwrrwrwrwwrrwrwrrrrwwrwwwrrwrrrwwwwwwwwwwwwwwwww
+ * rrrrrrrrrrrrrrwwwrrwrwrwrwrrwwwwwwwwwwwwwwrrrrrrrrrrrrrrrrrrwwwwrwrwwrwrwrwr
+ * wrrrrrrrwwwwwwwwwwwwwrrrwrrrwrrwrwwwwwwwwwwrrrrwwrwrrrrrrrrrrrwwwwwwwwwwwrww
+ * wwwrrrrrrrrwwrrrwwrwrwrwwwrrrrrrrwwwrrwwwrrwrwwwwwwwwrrrrrrrwwwrrrrrrrwwwwww
+ * wwwwwwwrwrrrrrrrrwrrwrrwrrwrwrrrwrrrwrrrwrwwwwwwwwwwwwwwwwwwrrrwwwrrrrrrrrrr
+ * rrwrrrrrrwrrwwwwwwwwwwwwwwwwrwwwrrwrwwrrrrrrrrrrrrrrrrrrrwwwwwwwwwwwwwwwwwww
+ * rrrrrwrrwrwrwrrwrrrwwwwwwwwrrrrwrrrwrwwrwrrrwrrwrrrrwwwwwwwrwrwwwwrwwrrrwrrr
+ * rrrwwwwwwwrrrrwwrrrrrrrrrrrrwrwrrrrwwwwwwwwwwwwwwrwrrrrwwwwrwrrrrwrwwwrrrwww
+ * rwwrrrrrrrwrrrrrrrrrrrrwwwwrrrwwwrwrrwwwwwwwwwwwwwwwwwwwwwrrrrrrrwwwwwwwrw
+ */
+.fi
+.SH "SEE ALSO"
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_cancel.1 b/tools/libaio/man/io_cancel.1
new file mode 100644 (file)
index 0000000..16e898a
--- /dev/null
@@ -0,0 +1,21 @@
+.\"/* sys_io_cancel:
+.\" *      Attempts to cancel an iocb previously passed to io_submit.  If
+.\" *      the operation is successfully cancelled, the resulting event is
+.\" *      copied into the memory pointed to by result without being placed
+.\" *      into the completion queue and 0 is returned.  May fail with
+.\" *      -EFAULT if any of the data structures pointed to are invalid.
+.\" *      May fail with -EINVAL if aio_context specified by ctx_id is
+.\" *      invalid.  May fail with -EAGAIN if the iocb specified was not
+.\" *      cancelled.  Will fail with -ENOSYS if not implemented.
+.\" */
+.\"
+.TH io_cancel 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_cancel \- cancel io requests
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.LP
+.BI "int io_cancel(io_context_t " ctx ", struct iocb *" iocb ", struct io_event *" result ");"
+
diff --git a/tools/libaio/man/io_cancel.3 b/tools/libaio/man/io_cancel.3
new file mode 100644 (file)
index 0000000..9a16084
--- /dev/null
@@ -0,0 +1,65 @@
+.TH io_cancel 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_cancel \- Cancel io requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br
+.B #include <libaio.h>
+.sp
+.br
+.BI "int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *result)"
+.br
+.sp
+struct iocb {
+       void            *data; /* Return in the io completion event */
+       unsigned        key;    /* For use in identifying io requests */
+       short           aio_lio_opcode;
+       short           aio_reqprio;    /* Not used */
+       int             aio_fildes;
+};
+.fi
+.SH DESCRIPTION
+Attempts to cancel an iocb previously passed to io_submit.  If
+the operation is successfully cancelled, the resulting event is
+copied into the memory pointed to by result without being placed
+into the completion queue.
+.PP
+When one or more requests are asynchronously processed, it might be
+useful in some situations to cancel a selected operation, e.g., if it
+becomes obvious that the written data is no longer accurate and would
+have to be overwritten soon.  As an example, assume an application, which
+writes data in files in a situation where new incoming data would have
+to be written in a file which will be updated by an enqueued request.
+.SH "RETURN VALUES"
+0 is returned on success; otherwise an error code is returned (see ERRORS).
+.SH ERRORS
+.TP
+.B EFAULT 
+If any of the data structures pointed to are invalid.
+.TP
+.B EINVAL 
+If aio_context specified by ctx_id is
+invalid.  
+.TP
+.B EAGAIN
+If the iocb specified was not
+cancelled.  
+.TP
+.B ENOSYS 
+if not implemented.
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_destroy.1 b/tools/libaio/man/io_destroy.1
new file mode 100644 (file)
index 0000000..177683b
--- /dev/null
@@ -0,0 +1,17 @@
+.\"/* sys_io_destroy:
+.\" *      Destroy the aio_context specified.  May cancel any outstanding 
+.\" *      AIOs and block on completion.  Will fail with -ENOSYS if not
+.\" *      implemented.  May fail with -EFAULT if the context pointed to
+.\" *      is invalid.
+.\" */
+.\" libaio provides this as io_queue_release.
+.TH io_destroy 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_destroy \- destroy an io context
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.LP
+.BI "int io_destroy(io_context_t " ctx ");"
+
diff --git a/tools/libaio/man/io_fsync.3 b/tools/libaio/man/io_fsync.3
new file mode 100644 (file)
index 0000000..53eb63d
--- /dev/null
@@ -0,0 +1,82 @@
+.\" static inline int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)
+.\" {
+.\"    io_prep_fsync(iocb, fd);
+.\"    io_set_callback(iocb, cb);
+.\"    return io_submit(ctx, 1, &iocb);
+.\" }
+.TH io_fsync 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_fsync \- Synchronize a file's complete in-core state with that on disk
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br
+.B #include <libaio.h>
+.sp
+.br
+.BI "int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)"
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.sp
+typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2);
+.sp
+.fi
+.SH DESCRIPTION
+When dealing with asynchronous operations it is sometimes necessary to
+get into a consistent state.  This would mean for AIO that one wants to
+know whether a certain request or a group of requests were processed.
+This could be done by waiting for the notification sent by the system
+after the operation terminated, but this sometimes would mean wasting
+resources (mainly computation time). 
+.PP
+Calling this function forces all I/O operations queued at the
+time of the function call and operating on the file descriptor
+.IR "iocb->aio_fildes"
+into the synchronized I/O completion state.  The 
+.IR "io_fsync"
+function returns
+immediately but the notification through the method described in
+.IR "io_callback"
+will happen only after all requests for this
+file descriptor have terminated and the file is synchronized.  This also
+means that requests for this very same file descriptor which are queued
+after the synchronization request are not affected.
+.SH "RETURN VALUES"
+Returns 0 on success; otherwise returns errno.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I ctx
+refers to an uninitialized aio context, or the iocb pointed to by 
+.I iocbs
+contains an improperly initialized iocb.
+.TP
+.B EBADF
+The iocb contains a file descriptor that does not exist.
+.TP
+.B EINVAL
+The file specified in the iocb does not support the given io operation.
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_getevents(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_getevents.1 b/tools/libaio/man/io_getevents.1
new file mode 100644 (file)
index 0000000..27730b9
--- /dev/null
@@ -0,0 +1,29 @@
+.\"/* io_getevents:
+.\" *      Attempts to read at least min_nr events and up to nr events from
+.\" *      the completion queue for the aio_context specified by ctx_id.  May
+.\" *      fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
+.\" *      if nr is out of range, if when is out of range.  May fail with
+.\" *      -EFAULT if any of the memory specified to is invalid.  May return
+.\" *      0 or < min_nr if no events are available and the timeout specified
+.\" *      by when has elapsed, where when == NULL specifies an infinite
+.\" *      timeout.  Note that the timeout pointed to by when is relative and
+.\" *      will be updated if not NULL and the operation blocks.  Will fail
+.\" *      with -ENOSYS if not implemented.
+.\" */
+.\"asmlinkage long sys_io_getevents(io_context_t ctx_id,
+.\"                                 long min_nr,
+.\"                                 long nr,
+.\"                                 struct io_event *events,
+.\"                                 struct timespec *timeout)
+.\"
+.TH io_getevents 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_getevents \- read resulting events from io requests
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.sp
+.BI "int io_getevents(io_context_t " ctx ", long " min_nr ", long " nr ", struct io_event *" events "[], struct timespec *" timeout ");"
+
+
diff --git a/tools/libaio/man/io_getevents.3 b/tools/libaio/man/io_getevents.3
new file mode 100644 (file)
index 0000000..8e9ddc8
--- /dev/null
@@ -0,0 +1,79 @@
+.\"/* io_getevents:
+.\" *      Attempts to read at least min_nr events and up to nr events from
+.\" *      the completion queue for the aio_context specified by ctx_id.  May
+.\" *      fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
+.\" *      if nr is out of range, if when is out of range.  May fail with
+.\" *      -EFAULT if any of the memory specified to is invalid.  May return
+.\" *      0 or < min_nr if no events are available and the timeout specified
+.\" *      by when has elapsed, where when == NULL specifies an infinite
+.\" *      timeout.  Note that the timeout pointed to by when is relative and
+.\" *      will be updated if not NULL and the operation blocks.  Will fail
+.\" *      with -ENOSYS if not implemented.
+.\" */
+.\"asmlinkage long sys_io_getevents(io_context_t ctx_id,
+.\"                                 long min_nr,
+.\"                                 long nr,
+.\"                                 struct io_event *events,
+.\"                                 struct timespec *timeout)
+.\"
+.TH io_getevents 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_getevents \- Read resulting events from io requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br
+.B #include <libaio.h>
+.br
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.sp
+struct io_event {
+        unsigned        PADDED(data, __pad1);
+        unsigned        PADDED(obj,  __pad2);
+        unsigned        PADDED(res,  __pad3);
+        unsigned        PADDED(res2, __pad4);
+};
+.sp
+.BI "int io_getevents(io_context_t " ctx ", long " min_nr ", long " nr ", struct io_event *" events "[], struct timespec *" timeout ");"
+
+.fi
+.SH DESCRIPTION
+Attempts to read at least min_nr events and up to nr events from
+the completion queue for the aio_context specified by ctx.  
+.SH "RETURN VALUES"
+May return
+0 if no events are available and the timeout specified
+by timeout has elapsed, where timeout == NULL specifies an infinite
+timeout.  Note that the timeout pointed to by timeout is relative and
+will be updated if not NULL and the operation blocks.  Will fail
+with ENOSYS if not implemented.
+.SH ERRORS
+.TP
+.B EINVAL 
+if ctx_id is invalid, if min_nr is out of range,
+if nr is out of range, if when is out of range.  
+.TP
+.B EFAULT 
+if any of the memory specified to is invalid.  
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_prep_fsync.3 b/tools/libaio/man/io_prep_fsync.3
new file mode 100644 (file)
index 0000000..4cf935a
--- /dev/null
@@ -0,0 +1,89 @@
+.\" static inline void io_prep_fsync(struct iocb *iocb, int fd)
+.\" {
+.\"    memset(iocb, 0, sizeof(*iocb));
+.\"    iocb->aio_fildes = fd;
+.\"    iocb->aio_lio_opcode = IO_CMD_FSYNC;
+.\"    iocb->aio_reqprio = 0;
+.\" }
+.TH io_prep_fsync 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_prep_fsync \- Synchronize a file's complete in-core state with that on disk
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "static inline void io_prep_fsync(struct iocb *iocb, int fd)"
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.sp
+.fi
+.SH DESCRIPTION
+This is an inline convenience function for setting up an iocb for a FSYNC request.
+.br
+The file for which
+.TP 
+.IR "iocb->aio_fildes = fd" 
+is a descriptor is set up with
+the command
+.TP 
+.IR "iocb->aio_lio_opcode = IO_CMD_FSYNC"
+.
+.PP
+The io_prep_fsync() function shall set up an IO_CMD_FSYNC operation
+to asynchronously force all I/O
+operations associated with the file indicated by the file
+descriptor aio_fildes member of the iocb structure referenced by
+the iocb argument and queued at the time of the call to
+io_submit() to the synchronized I/O completion state. The function
+call shall return when the synchronization request has been
+initiated or queued to the file or device (even when the data
+cannot be synchronized immediately).
+
+All currently queued I/O operations shall be completed as if by a call
+to fsync(); that is, as defined for synchronized I/O file
+integrity completion. If the
+operation queued by io_prep_fsync() fails, then, as for fsync(),
+outstanding I/O operations are not guaranteed to have
+been completed.
+
+If io_prep_fsync() succeeds, then it is only the I/O that was queued
+at the time of the call to io_submit() that is guaranteed to be
+forced to the relevant completion state. The completion of
+subsequent I/O on the file descriptor is not guaranteed to be
+completed in a synchronized fashion.
+.PP
+This function returns immediately. To schedule the operation, the
+function
+.IR io_submit
+must be called.
+.PP
+Simultaneous asynchronous operations using the same iocb produce
+undefined results.
+.SH "RETURN VALUES"
+None
+.SH ERRORS
+None
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_prep_pread.3 b/tools/libaio/man/io_prep_pread.3
new file mode 100644 (file)
index 0000000..5938aec
--- /dev/null
@@ -0,0 +1,79 @@
+.\" static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+.\" {
+.\"    memset(iocb, 0, sizeof(*iocb));
+.\"    iocb->aio_fildes = fd;
+.\"    iocb->aio_lio_opcode = IO_CMD_PREAD;
+.\"    iocb->aio_reqprio = 0;
+.\"    iocb->u.c.buf = buf;
+.\"    iocb->u.c.nbytes = count;
+.\"    iocb->u.c.offset = offset;
+.\" }
+.TH io_prep_pread 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_prep_pread \- Set up asynchronous read
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.sp
+.br
+.B #include <libaio.h>
+.br
+.sp
+.BI "inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)"
+.br
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.fi
+.SH DESCRIPTION
+.IR io_prep_pread 
+is an inline convenience function designed to facilitate the initialization of
+the iocb for an asynchronous read operation.
+
+The first
+.TP
+.IR "iocb->u.c.nbytes = count"
+bytes of the file for which
+.TP
+.IR "iocb->aio_fildes = fd"
+is a descriptor are written to the buffer
+starting at
+.TP
+.IR "iocb->u.c.buf = buf"
+.
+.br
+Reading starts at the absolute position
+.TP
+.IR "iocb->u.c.offset = offset"
+in the file.
+.PP
+This function returns immediately. To schedule the operation, the
+function 
+.IR io_submit
+must be called.
+.PP
+Simultaneous asynchronous operations using the same iocb produce
+undefined results.
+.SH "RETURN VALUES"
+None
+.SH ERRORS
+None
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_prep_pwrite.3 b/tools/libaio/man/io_prep_pwrite.3
new file mode 100644 (file)
index 0000000..68b3500
--- /dev/null
@@ -0,0 +1,77 @@
+.\" static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+.\" {
+.\"    memset(iocb, 0, sizeof(*iocb));
+.\"    iocb->aio_fildes = fd;
+.\"    iocb->aio_lio_opcode = IO_CMD_PWRITE;
+.\"    iocb->aio_reqprio = 0;
+.\"    iocb->u.c.buf = buf;
+.\"    iocb->u.c.nbytes = count;
+.\"    iocb->u.c.offset = offset;
+.\" }
+.TH io_prep_pwrite 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_prep_pwrite \- Set up iocb for asynchronous writes
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)"
+.br
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.fi
+.SH DESCRIPTION
+io_prep_pwrite is a convenience function for setting up parallel writes.
+
+The first
+.TP
+.IR "iocb->u.c.nbytes = count"
+bytes of the file for which
+.TP
+.IR "iocb->aio_fildes = fd"
+is a descriptor are written from the buffer
+starting at
+.TP
+.IR "iocb->u.c.buf = buf"
+.
+.br
+Writing starts at the absolute position
+.TP
+.IR "iocb->u.c.offset = offset"
+in the file.
+.PP
+This function returns immediately. To schedule the operation, the
+function
+.IR io_submit
+must be called.
+.PP
+Simultaneous asynchronous operations using the same iocb produce
+undefined results.
+.SH "RETURN VALUES"
+None
+.SH ERRORS
+None
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_queue_init.3 b/tools/libaio/man/io_queue_init.3
new file mode 100644 (file)
index 0000000..317f631
--- /dev/null
@@ -0,0 +1,63 @@
+.TH io_queue_init 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_queue_init \- Initialize asynchronous io state machine
+
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "int io_queue_init(int maxevents, io_context_t  *ctx );"
+.sp
+.fi
+.SH DESCRIPTION
+.B io_queue_init
+Attempts to create an aio context capable of receiving at least 
+.IR maxevents
+events. 
+.IR ctx
+must not point to an aio context that already exists and must be initialized
+to 
+.IR 0
+before the call.
+If the operation is successful, *ctx is filled with the resulting handle.
+.SH "RETURN VALUES"
+On success,
+.B io_queue_init
+returns 0.  Otherwise, -error is returned, where
+error is one of the Exxx values defined in the Errors section.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I maxevents
+is <= 0 or 
+.IR ctx
+is an invalid memory location.
+.TP
+.B ENOSYS 
+Not implemented
+.TP
+.B EAGAIN
+.IR "maxevents > max_aio_reqs"
+where max_aio_reqs is a tunable value.
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_queue_release.3 b/tools/libaio/man/io_queue_release.3
new file mode 100644 (file)
index 0000000..06b9ec0
--- /dev/null
@@ -0,0 +1,48 @@
+.TH io_queue_release 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_queue_release \- Release the context associated with the userspace handle
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.br
+.sp
+.BI "int io_queue_release(io_context_t ctx)"
+.fi
+.SH DESCRIPTION
+.B io_queue_release
+destroys the context associated with the userspace handle.    May cancel any outstanding
+AIOs and block on completion.
+
+.B cts.
+.SH "RETURN VALUES"
+On success,
+.B io_queue_release
+returns 0.  Otherwise, -error is returned, where
+error is one of the Exxx values defined in the Errors section.
+.SH ERRORS
+.TP
+.B EINVAL
+.I ctx 
+refers to an uninitialized aio context, or the iocb pointed to by
+.I iocbs 
+contains an improperly initialized iocb.
+.TP
+.B ENOSYS 
+Not implemented
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
+
diff --git a/tools/libaio/man/io_queue_run.3 b/tools/libaio/man/io_queue_run.3
new file mode 100644 (file)
index 0000000..57dd417
--- /dev/null
@@ -0,0 +1,50 @@
+.TH io_queue_run 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_queue_run \- Handle completed io requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "int io_queue_run(io_context_t  ctx );"
+.sp
+.fi
+.SH DESCRIPTION
+.B io_queue_run
+Attempts to read all the events from
+the completion queue for the aio_context specified by ctx_id.
+.SH "RETURN VALUES"
+May return
+0 if no events are available.
+Will fail with -ENOSYS if not implemented.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I ctx 
+refers to an uninitialized aio context, or the iocb pointed to by
+.I iocbs 
+contains an improperly initialized iocb.
+.TP
+.B ENOSYS 
+Not implemented
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_queue_wait.3 b/tools/libaio/man/io_queue_wait.3
new file mode 100644 (file)
index 0000000..2306663
--- /dev/null
@@ -0,0 +1,56 @@
+.TH io_queue_wait 2 2002-09-03 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_queue_wait \- Wait for io requests to complete
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "int io_queue_wait(io_context_t ctx, const struct timespec *timeout);"
+.fi
+.SH DESCRIPTION
+Attempts to read  an event from
+the completion queue for the aio_context specified by ctx_id.
+.SH "RETURN VALUES"
+May return
+0 if no events are available and the timeout specified
+by when has elapsed, where when == NULL specifies an infinite
+timeout.  Note that the timeout pointed to by when is relative and
+will be updated if not NULL and the operation blocks.  Will fail
+with -ENOSYS if not implemented.
+.SH "RETURN VALUES"
+On success,
+.B io_queue_wait
+returns 0.  Otherwise, -error is returned, where
+error is one of the Exxx values defined in the Errors section.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I ctx 
+refers to an uninitialized aio context, or the iocb pointed to by
+.I iocbs 
+contains an improperly initialized iocb.
+.TP
+.B ENOSYS 
+Not implemented
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_set_callback(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_set_callback.3 b/tools/libaio/man/io_set_callback.3
new file mode 100644 (file)
index 0000000..a8ca789
--- /dev/null
@@ -0,0 +1,44 @@
+.\" static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)
+.TH io_set_callback 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_set_callback \- Set up io completion callback function
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)"
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.sp
+typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2);
+.sp
+.fi
+.SH DESCRIPTION
+The callback is not done if the caller uses raw events from 
+io_getevents, only with the library helpers
+.SH "RETURN VALUES"
+.SH ERRORS
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_submit(3),
+.BR errno(3)
diff --git a/tools/libaio/man/io_setup.1 b/tools/libaio/man/io_setup.1
new file mode 100644 (file)
index 0000000..68690e1
--- /dev/null
@@ -0,0 +1,15 @@
+.\"/* sys_io_setup:
+.\" *      Create an aio_context capable of receiving at least nr_events.
+.\" *      ctxp must not point to an aio_context that already exists, and
+.\" *      must be initialized to 0 prior to the call.  On successful
+.\" *      creation of the aio_context, *ctxp is filled in with the resulting 
+.\" *      handle.  May fail with -EINVAL if *ctxp is not initialized,
+.\" *      if the specified nr_events exceeds internal limits.  May fail 
+.\" *      with -EAGAIN if the specified nr_events exceeds the user's limit 
+.\" *      of available events.  May fail with -ENOMEM if insufficient kernel
+.\" *      resources are available.  May fail with -EFAULT if an invalid
+.\" *      pointer is passed for ctxp.  Will fail with -ENOSYS if not
+.\" *      implemented.
+.\" */
+.\" -- note: libaio is actually providing io_queue_init and io_queue_grow
+.\" as separate functions.  For now io_setup is the same as io_queue_grow.
diff --git a/tools/libaio/man/io_submit.1 b/tools/libaio/man/io_submit.1
new file mode 100644 (file)
index 0000000..f66e80f
--- /dev/null
@@ -0,0 +1,109 @@
+.TH io_submit 2 2002-09-02 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_submit \- submit io requests
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.LP
+.BI "int io_submit(io_context_t " ctx ", long " nr ", struct iocb *" iocbs "[]);"
+.SH DESCRIPTION
+.B io_submit
+submits to the io_context
+.I ctx
+up to
+.I nr
+I/O requests pointed to by the vector
+.IR iocbs .
+
+The
+.B iocb
+structure is defined as something like
+.sp
+.RS
+.nf
+struct iocb {
+    void    *data;
+.\"    unsigned    key;
+    short    aio_lio_opcode;
+    short    aio_reqprio;
+    int      aio_fildes;
+};
+.fi
+.RE
+.sp
+.I data
+is an opaque pointer which will upon completion be returned in the
+.B io_event
+structure by
+.BR io_getevents (2).
+.\" and io_wait(2)
+Callers will typically use this to point directly or indirectly to a
+callback function.
+.sp
+.I aio_lio_opcode
+is the I/O operation requested.  Callers will typically set this and the
+arguments to the I/O operation calling the
+.BR io_prep_ *(3)
+function corresponding to the operation.
+.sp
+.I aio_reqprio
+is the priority of the request.  Higher values have more priority; the
+normal priority is 0.
+.sp
+.I aio_fildes
+is the file descriptor for the I/O operation.
+Callers will typically set this and the
+arguments to the I/O operation calling the
+.BR io_prep_ *(3)
+function corresponding to the operation.
+.sp
+The caller may not modify the contents or resubmit a submitted
+.B iocb
+structure until after the operation completes or is canceled.
+The implementation of
+.BR io_submit (2)
+is permitted to modify reserved fields of the
+.B iocb
+structure.
+.SH "RETURN VALUES"
+If able to submit at least one iocb,
+.B io_submit
+returns the number of iocbs submitted successfully.  Otherwise, 
+.RI - error
+is returned, where 
+.I error
+is one of the Exxx values defined in the Errors section.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I nr
+is negative,
+.I ctx
+refers to an uninitialized aio context, the iocb pointed to by 
+.IR iocbs [0]
+is improperly initialized or specifies an unsupported operation.
+.TP
+.B EBADF
+The iocb pointed to by
+.IR iocbs [0]
+contains a file descriptor that does not exist.
+.TP
+.B EAGAIN
+Insufficient resources were available to queue any operations.
+.SH "SEE ALSO"
+.BR io_setup (2),
+.BR io_destroy (2),
+.BR io_getevents (2),
+.\".BR io_wait (2),
+.BR io_prep_pread (3),
+.BR io_prep_pwrite (3),
+.BR io_prep_fsync (3),
+.BR io_prep_fdsync (3),
+.BR io_prep_noop (3),
+.BR io_cancel (2),
+.BR errno (3)
diff --git a/tools/libaio/man/io_submit.3 b/tools/libaio/man/io_submit.3
new file mode 100644 (file)
index 0000000..b6966ef
--- /dev/null
@@ -0,0 +1,135 @@
+.\"/* sys_io_submit:
+.\" *      Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+.\" *      the number of iocbs queued.  May return -EINVAL if the aio_context
+.\" *      specified by ctx_id is invalid, if nr is < 0, if the iocb at
+.\" *      *iocbpp[0] is not properly initialized, if the operation specified
+.\" *      is invalid for the file descriptor in the iocb.  May fail with
+.\" *      -EFAULT if any of the data structures point to invalid data.  May
+.\" *      fail with -EBADF if the file descriptor specified in the first
+.\" *      iocb is invalid.  May fail with -EAGAIN if insufficient resources
+.\" *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+.\" *      fail with -ENOSYS if not implemented.
+.\" */
+.TH io_submit 2 2002-09-02 "Linux 2.4" "Linux AIO"
+.SH NAME
+io_submit \- Submit io requests
+.SH SYNOPSIS
+.nf
+.B #include <errno.h>
+.br
+.sp
+.B #include <libaio.h>
+.br
+.sp
+.BI "int io_submit(io_context_t " ctx ", long " nr ", struct iocb *" iocbs "[]);"
+.sp
+struct iocb {
+       void            *data;
+       unsigned        key;
+       short           aio_lio_opcode;
+       short           aio_reqprio;
+       int             aio_fildes;
+};
+.fi
+.SH DESCRIPTION
+.B io_submit
+submits
+.I nr
+iocbs for processing for a given io context ctx.
+
+The 
+.IR "io_submit"
+function can be used to enqueue an arbitrary
+number of read and write requests at one time.  The requests can all be
+meant for the same file, all for different files or every solution in
+between.
+
+.IR "io_submit"
+gets the 
+.IR "nr"
+requests from the array pointed to
+by 
+.IR "iocbs" .
+The operation to be performed is determined by the
+.IR "aio_lio_opcode"
+member in each element of 
+.IR "iocbs" .
+If this
+field is 
+.B "IO_CMD_PREAD"
+a read operation is enqueued, similar to a call
+of 
+.IR "io_prep_pread"
+for this element of the array (except that the way
+the termination is signalled is different, as we will see below).  If
+the 
+.IR "aio_lio_opcode"
+member is 
+.B "IO_CMD_PWRITE"
+a write operation
+is enqueued.  Otherwise the 
+.IR "aio_lio_opcode"
+must be 
+.B "IO_CMD_NOOP"
+in which case this element of 
+.IR "iocbs"
+is simply ignored.  This
+``operation'' is useful in situations where one has a fixed array of
+.IR "struct iocb"
+elements from which only a few need to be handled at
+a time.  Another situation is where the 
+.IR "io_submit"
+call was
+canceled before all requests are processed  and the remaining requests have to be reissued.
+
+The other members of each element of the array pointed to by
+.IR "iocbs"
+must have values suitable for the operation as described in
+the documentation for 
+.IR "io_prep_pread"
+and 
+.IR "io_prep_pwrite"
+above.
+
+The function returns immediately after
+having enqueued all the requests.  
+On success,
+.B io_submit
+returns the number of iocbs submitted successfully.  Otherwise, \-error is returned, where
+error is one of the Exxx values defined in the Errors section.
+.PP
+If an error is detected, then the behavior is undefined.
+.PP
+Simultaneous asynchronous operations using the same iocb produce
+undefined results.
+.SH ERRORS
+.TP
+.B EFAULT
+.I iocbs
+referenced data outside of the program's accessible address space.
+.TP
+.B EINVAL
+.I ctx
+refers to an uninitialized aio context, or the array pointed to by
+.I iocbs
+contains an improperly initialized iocb.
+.TP
+.B EBADF
+The iocb contains a file descriptor that does not exist.
+.TP
+.B EINVAL
+The file specified in the iocb does not support the given io operation.
+.SH "SEE ALSO"
+.BR io(3),
+.BR io_cancel(3),
+.BR io_fsync(3),
+.BR io_getevents(3),
+.BR io_prep_fsync(3),
+.BR io_prep_pread(3),
+.BR io_prep_pwrite(3),
+.BR io_queue_init(3),
+.BR io_queue_release(3),
+.BR io_queue_run(3),
+.BR io_queue_wait(3),
+.BR io_set_callback(3),
+.BR errno(3)
diff --git a/tools/libaio/man/lio_listio.3 b/tools/libaio/man/lio_listio.3
new file mode 100644 (file)
index 0000000..9b5b5e4
--- /dev/null
@@ -0,0 +1,229 @@
+.TH lio_listio 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+lio_listio \- List directed I/O
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.LP
+.BI "int lio_listio (int mode, struct aiocb *const list[], int nent, struct sigevent *sig)"
+.nf
+.SH DESCRIPTION
+
+Besides these functions with the more or less traditional interface,
+POSIX.1b also defines a function which can initiate more than one
+operation at a time, and which can handle freely mixed read and write
+operations.  It is therefore similar to a combination of 
+.IR readv
+and
+.IR "writev"
+.
+
+The 
+.IR "lio_listio"
+function can be used to enqueue an arbitrary
+number of read and write requests at one time.  The requests can all be
+meant for the same file, all for different files or every solution in
+between.
+
+.IR "lio_listio"
+gets the 
+.IR "nent"
+requests from the array pointed to
+by 
+.IR "list" .
+The operation to be performed is determined by the
+.IR "aio_lio_opcode"
+member in each element of 
+.IR "list" .
+If this
+field is 
+.B "LIO_READ"
+a read operation is enqueued, similar to a call
+of 
+.IR "aio_read"
+for this element of the array (except that the way
+the termination is signalled is different, as we will see below).  If
+the 
+.IR "aio_lio_opcode"
+member is 
+.B "LIO_WRITE"
+a write operation
+is enqueued.  Otherwise the 
+.IR "aio_lio_opcode"
+must be 
+.B "LIO_NOP"
+in which case this element of 
+.IR "list"
+is simply ignored.  This
+``operation'' is useful in situations where one has a fixed array of
+.IR "struct aiocb"
+elements from which only a few need to be handled at
+a time.  Another situation is where the 
+.IR "lio_listio"
+call was
+canceled before all requests are processed  and the remaining requests have to be reissued.
+
+The other members of each element of the array pointed to by
+.IR "list"
+must have values suitable for the operation as described in
+the documentation for 
+.IR "aio_read"
+and 
+.IR "aio_write"
+above.
+
+The 
+.IR "mode"
+argument determines how 
+.IR "lio_listio"
+behaves after
+having enqueued all the requests.  If 
+.IR "mode"
+is 
+.B "LIO_WAIT"
+it
+waits until all requests terminated.  Otherwise 
+.IR "mode"
+must be
+.B "LIO_NOWAIT"
+and in this case the function returns immediately after
+having enqueued all the requests.  In this case the caller gets a
+notification of the termination of all requests according to the
+.IR "sig"
+parameter.  If 
+.IR "sig"
+is 
+.B "NULL"
+no notification is
+sent.  Otherwise a signal is sent or a thread is started, just as
+described in the description for 
+.IR "aio_read"
+or 
+.IR "aio_write"
+.
+
+When the sources are compiled with 
+.B "_FILE_OFFSET_BITS == 64"
+, this
+function is in fact 
+.IR "lio_listio64"
+since the LFS interface
+transparently replaces the normal implementation.
+.SH "RETURN VALUES"
+If 
+.IR "mode"
+is 
+.B "LIO_WAIT"
+, the return value of 
+.IR "lio_listio"
+is 
+.IR 0
+when all requests completed successfully.  Otherwise the
+function returns
+.IR \-1
+and 
+.IR "errno"
+is set accordingly.  To find
+out which request or requests failed one has to use the 
+.IR "aio_error"
+function on all the elements of the array 
+.IR "list"
+.
+
+In case 
+.IR "mode"
+is 
+.B "LIO_NOWAIT"
+, the function returns 
+.IR 0
+if
+all requests were enqueued correctly.  The current state of the requests
+can be found using 
+.IR "aio_error"
+and 
+.IR "aio_return"
+as described
+above.  If 
+.IR "lio_listio"
+returns 
+.IR -1
+in this mode, the
+global variable 
+.IR "errno"
+is set accordingly.  If a request did not
+yet terminate, a call to 
+.IR "aio_error"
+returns 
+.BR "EINPROGRESS" .
+If
+the value is different, the request is finished and the error value (or
+
+.IR 0
+) is returned and the result of the operation can be retrieved
+using 
+.IR "aio_return"
+.
+.SH ERRORS
+Possible values for 
+.IR "errno"
+are:
+
+.TP
+.B EAGAIN
+The resources necessary to queue all the requests are not available at
+the moment.  The error status for each element of 
+.IR "list"
+must be
+checked to determine which request failed.
+
+Another reason could be that the system wide limit of AIO requests is
+exceeded.  This cannot be the case for the implementation on GNU systems
+since no arbitrary limits exist.
+.TP
+.B EINVAL
+The 
+.IR "mode"
+parameter is invalid or 
+.IR "nent"
+is larger than
+.B "AIO_LISTIO_MAX"
+.
+.TP
+.B EIO
+One or more of the request's I/O operations failed.  The error status of
+each request should be checked to determine which one failed.
+.TP
+.B ENOSYS
+The 
+.IR "lio_listio"
+function is not supported.
+.PP
+
+If the 
+.IR "mode"
+parameter is 
+.B "LIO_NOWAIT"
+and the caller cancels
+a request, the error status for this request returned by
+.IR "aio_error"
+is 
+.B "ECANCELED"
+.
+.SH "SEE ALSO"
+.BR aio(3),
+.BR aio_cancel(3),
+.BR aio_cancel64(3),
+.BR aio_error(3),
+.BR aio_error64(3),
+.BR aio_fsync(3),
+.BR aio_fsync64(3),
+.BR aio_init(3),
+.BR aio_read(3),
+.BR aio_read64(3),
+.BR aio_return(3),
+.BR aio_return64(3),
+.BR aio_suspend(3),
+.BR aio_suspend64(3),
+.BR aio_write(3),
+.BR aio_write64(3)
diff --git a/tools/libaio/man/lio_listio64.3 b/tools/libaio/man/lio_listio64.3
new file mode 100644 (file)
index 0000000..97f6955
--- /dev/null
@@ -0,0 +1,39 @@
+.TH lio_listio64 3 2002-09-12 "Linux 2.4" "Linux AIO"
+.SH NAME
+lio_listio64 \- List directed I/O
+.SH SYNOPSIS
+.B #include <errno.h>
+.br
+.B #include <libaio.h>
+.LP
+.BI "int lio_listio64 (int mode, struct aiocb *const list[], int nent, struct sigevent *sig)"
+.nf
+.SH DESCRIPTION
+This function is similar to the 
+.IR "lio_listio"
+function.  The only
+difference is that on 
+.IR "32 bit"
+machines, the file descriptor should
+be opened in the large file mode.  Internally, 
+.IR "lio_listio64"
+uses
+functionality equivalent to 
+.IR "lseek64"
+to position the file descriptor correctly for the reading or
+writing, as opposed to 
+.IR "lseek"
+functionality used in
+.IR "lio_listio".
+
+When the sources are compiled with 
+.IR "_FILE_OFFSET_BITS == 64"
+, this
+function is available under the name 
+.IR "lio_listio"
+and so
+transparently replaces the interface for small files on 32 bit
+machines.
+.SH "RETURN VALUES"
+.SH ERRORS
+.SH "SEE ALSO"
diff --git a/tools/libaio/src/Makefile b/tools/libaio/src/Makefile
new file mode 100644 (file)
index 0000000..8d134cc
--- /dev/null
@@ -0,0 +1,75 @@
+# libaio build: produces the static archive libaio.a and the versioned
+# shared object $(libname), and installs both with the public header.
+#
+# Overridable on the command line: prefix, includedir, libdir, and DESTDIR
+# (a staging root prepended to every install path; empty by default).
+prefix=/usr
+includedir=$(prefix)/include
+libdir=$(prefix)/lib
+
+ARCH := $(shell uname -m | sed -e s/i.86/i386/)
+CFLAGS := -nostdlib -nostartfiles -Wall -I. -g -fomit-frame-pointer -O2 -fPIC
+SO_CFLAGS=-shared $(CFLAGS)
+L_CFLAGS=$(CFLAGS)
+LINK_FLAGS=
+
+soname=libaio.so.1
+minor=0
+micro=1
+libname=$(soname).$(minor).$(micro)
+all_targets += libaio.a $(libname)
+
+# These goals name actions rather than files.
+.PHONY: all install clean
+
+# Never leave a half-written target behind when a recipe fails.
+.DELETE_ON_ERROR:
+
+all: $(all_targets)
+
+# libaio provided functions
+libaio_srcs := io_queue_init.c io_queue_release.c
+libaio_srcs += io_queue_wait.c io_queue_run.c
+
+# real syscalls
+libaio_srcs += io_getevents.c io_submit.c io_cancel.c
+libaio_srcs += io_setup.c io_destroy.c
+
+# internal functions
+libaio_srcs += raw_syscall.c
+
+# old symbols
+libaio_srcs += compat-0_1.c
+
+# .ol objects go into the static archive, .os objects into the shared library.
+libaio_objs := $(patsubst %.c,%.ol,$(libaio_srcs))
+libaio_sobjs := $(patsubst %.c,%.os,$(libaio_srcs))
+
+# Every object depends on both headers (this already covers libaio.h for the .ol set).
+$(libaio_objs) $(libaio_sobjs): libaio.h vsys_def.h
+
+%.os: %.c
+	$(CC) $(SO_CFLAGS) -c -o $@ $<
+
+%.ol: %.c
+	$(CC) $(L_CFLAGS) -c -o $@ $<
+
+# The archive is removed first so members of a previous build never linger.
+libaio.a: $(libaio_objs)
+	rm -f $@
+	$(AR) r $@ $^
+	ranlib $@
+
+$(libname): $(libaio_sobjs) libaio.map
+	$(CC) $(SO_CFLAGS) -Wl,--version-script=libaio.map -Wl,-soname=$(soname) -o $@ $(libaio_sobjs) $(LINK_FLAGS)
+
+install: $(all_targets)
+	install -D -m 644 libaio.h $(DESTDIR)$(includedir)/libaio.h
+	install -D -m 644 libaio.a $(DESTDIR)$(libdir)/libaio.a
+	install -D -m 755 $(libname) $(DESTDIR)$(libdir)/$(libname)
+	ln -sf $(libname) $(DESTDIR)$(libdir)/$(soname)
+	ln -sf $(libname) $(DESTDIR)$(libdir)/libaio.so
+
+clean:
+	rm -f $(all_targets) $(libaio_objs) $(libaio_sobjs) $(soname).new
+	rm -f *.so* *.a *.o
diff --git a/tools/libaio/src/compat-0_1.c b/tools/libaio/src/compat-0_1.c
new file mode 100644 (file)
index 0000000..136396f
--- /dev/null
@@ -0,0 +1,62 @@
+/* libaio Linux async I/O interface
+
+   compat-0_1.c : compatibility symbols for libaio 0.1.x-0.3.x
+
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <stdlib.h>
+#include <asm/errno.h>
+
+#include "libaio.h"
+#include "vsys_def.h"
+
+#include "syscall.h"
+
+
+/* ABI 0.1 shim: the old io_cancel took no io_event result argument. */
+SYMVER(compat0_1_io_cancel, io_cancel, 0.1);
+int compat0_1_io_cancel(io_context_t ctx, struct iocb *iocb)
+{
+       struct io_event discarded;      /* filled by io_cancel, never handed back */
+
+       /* FIXME: the old ABI would return the event on the completion queue */
+       return io_cancel(ctx, iocb, &discarded);
+}
+
+SYMVER(compat0_1_io_queue_wait, io_queue_wait, 0.1);
+int compat0_1_io_queue_wait(io_context_t ctx, struct timespec *when)
+{
+       /* ABI 0.1 shim: hand io_getevents a private copy of *when, if any. */
+       struct timespec copy, *tmo = when ? &copy : NULL;
+       if (tmo != NULL)
+               copy = *when;
+       return io_getevents(ctx, 0, 0, NULL, tmo);
+}
+
+/* ABI 0.1 shim: the old call had no min_nr argument and a const timeout. */
+SYMVER(compat0_1_io_getevents, io_getevents, 0.1);
+int compat0_1_io_getevents(io_context_t ctx_id, long nr,
+                      struct io_event *events,
+                      const struct timespec *const_timeout)
+{
+       /* Writable copy, since the current io_getevents takes a non-const timeout. */
+       struct timespec copy, *tmo = const_timeout ? &copy : NULL;
+       if (tmo != NULL)
+               copy = *const_timeout;
+       return io_getevents(ctx_id, 1, nr, events, tmo);  /* min_nr fixed at 1 */
+}
+
diff --git a/tools/libaio/src/io_cancel.c b/tools/libaio/src/io_cancel.c
new file mode 100644 (file)
index 0000000..2f0f5f4
--- /dev/null
@@ -0,0 +1,23 @@
+/* io_cancel.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include "syscall.h"
+/* NOTE(review): io_syscall3 and DEFSYMVER come from syscall.h (not shown here); presumably they emit the io_cancel_0_4 stub for the io_cancel syscall and export it as io_cancel at version 0.4 -- confirm. */
+io_syscall3(int, io_cancel_0_4, io_cancel, io_context_t, ctx, struct iocb *, iocb, struct io_event *, event)
+DEFSYMVER(io_cancel_0_4, io_cancel, 0.4)
diff --git a/tools/libaio/src/io_destroy.c b/tools/libaio/src/io_destroy.c
new file mode 100644 (file)
index 0000000..0ab6bd1
--- /dev/null
@@ -0,0 +1,23 @@
+/* io_destroy
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <errno.h>
+#include <libaio.h>
+#include "syscall.h"
+/* NOTE(review): io_syscall1 comes from syscall.h (not shown here); presumably it emits io_destroy(io_context_t ctx) as a stub for the io_destroy syscall -- confirm. */
+io_syscall1(int, io_destroy, io_destroy, io_context_t, ctx)
diff --git a/tools/libaio/src/io_getevents.c b/tools/libaio/src/io_getevents.c
new file mode 100644 (file)
index 0000000..5a05174
--- /dev/null
@@ -0,0 +1,57 @@
+/* io_getevents.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+#include "syscall.h"
+
+io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout)
+/* Value io_getevents_0_4 requires in aio_ring.magic before it trusts the ring. */
+#define AIO_RING_MAGIC                  0xa10a10a1
+/* Header that io_getevents_0_4 reads at the address held in an io_context_t. */
+/* Ben will hate me for this */
+struct aio_ring {
+       unsigned        id;     /* kernel internal index number */
+       unsigned        nr;     /* number of io_events */
+       unsigned        head;   /* compared with tail by io_getevents_0_4 */
+       unsigned        tail;   /* head == tail is treated as "no events" */
+       unsigned        magic;  /* checked against AIO_RING_MAGIC */
+       unsigned        compat_features;
+       unsigned        incompat_features;
+       unsigned        header_length;  /* size of aio_ring */
+};
+
+int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout)
+{
+       /* The handle is treated as the address of the event ring header. */
+       const struct aio_ring *hdr = (const struct aio_ring *)ctx;
+
+       /* Zero-timeout poll of a recognised, empty ring: answer without a syscall. */
+       if (hdr != NULL && hdr->magic == AIO_RING_MAGIC &&
+           timeout != NULL && timeout->tv_sec == 0 && timeout->tv_nsec == 0 &&
+           hdr->head == hdr->tail)
+               return 0;
+
+       /* Everything else is forwarded to the __io_getevents_0_4 stub. */
+       return __io_getevents_0_4(ctx, min_nr, nr, events, timeout);
+}
+
+DEFSYMVER(io_getevents_0_4, io_getevents, 0.4)
diff --git a/tools/libaio/src/io_queue_init.c b/tools/libaio/src/io_queue_init.c
new file mode 100644 (file)
index 0000000..563d137
--- /dev/null
@@ -0,0 +1,33 @@
+/* io_queue_init.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+#include "syscall.h"
+
+int io_queue_init(int maxevents, io_context_t *ctxp)
+{
+       /* Reject zero or negative queue sizes before touching *ctxp. */
+       if (maxevents <= 0)
+               return -EINVAL;
+       *ctxp = NULL;   /* io_setup is always handed a cleared handle */
+       return io_setup(maxevents, ctxp);
+}
diff --git a/tools/libaio/src/io_queue_release.c b/tools/libaio/src/io_queue_release.c
new file mode 100644 (file)
index 0000000..94bbb86
--- /dev/null
@@ -0,0 +1,27 @@
+/* io_queue_release.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+/* Releases ctx by forwarding to io_destroy and returning its result unchanged. */
+int io_queue_release(io_context_t ctx)
+{
+       return io_destroy(ctx);
+}
diff --git a/tools/libaio/src/io_queue_run.c b/tools/libaio/src/io_queue_run.c
new file mode 100644 (file)
index 0000000..e0132f4
--- /dev/null
@@ -0,0 +1,39 @@
+/* io_queue_run.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+
+int io_queue_run(io_context_t ctx)
+{
+       /* Zero timeout = poll only.  Automatic (not static) so every call and
+        * every thread starts from its own {0, 0} rather than a shared object. */
+       struct timespec timeout = { 0, 0 };
+       struct io_event event;
+       int ret;
+
+       /* FIXME: batch requests? */
+       while (1 == (ret = io_getevents(ctx, 0, 1, &event, &timeout))) {
+               /* event.data is interpreted as the completion callback. */
+               io_callback_t cb = (io_callback_t)event.data;
+               struct iocb *iocb = event.obj;
+               cb(ctx, iocb, event.res, event.res2);
+       }
+       return ret;     /* first io_getevents result other than 1 */
+}
diff --git a/tools/libaio/src/io_queue_wait.c b/tools/libaio/src/io_queue_wait.c
new file mode 100644 (file)
index 0000000..538d2f3
--- /dev/null
@@ -0,0 +1,31 @@
+/* io_queue_wait.c
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#define NO_SYSCALL_ERRNO
+#include <sys/types.h>
+#include <libaio.h>
+#include <errno.h>
+#include "syscall.h"
+
+struct timespec;
+/* Calls io_getevents with min_nr 0, nr 0 and no event buffer, passing timeout through; DEFSYMVER (syscall.h, not shown) presumably exports it as io_queue_wait 0.4 -- confirm. */
+int io_queue_wait_0_4(io_context_t ctx, struct timespec *timeout)
+{
+       return io_getevents(ctx, 0, 0, NULL, timeout);
+}
+DEFSYMVER(io_queue_wait_0_4, io_queue_wait, 0.4)
diff --git a/tools/libaio/src/io_setup.c b/tools/libaio/src/io_setup.c
new file mode 100644 (file)
index 0000000..4ba1afc
--- /dev/null
@@ -0,0 +1,23 @@
+/* io_setup
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <errno.h>
+#include <libaio.h>
+#include "syscall.h"
+/* NOTE(review): io_syscall2 comes from syscall.h (not shown here); presumably it emits io_setup(int maxevents, io_context_t *ctxp) as a stub for the io_setup syscall -- confirm. */
+io_syscall2(int, io_setup, io_setup, int, maxevents, io_context_t *, ctxp)
diff --git a/tools/libaio/src/io_submit.c b/tools/libaio/src/io_submit.c
new file mode 100644 (file)
index 0000000..e22ba54
--- /dev/null
@@ -0,0 +1,23 @@
+/* io_submit
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <errno.h>
+#include <libaio.h>
+#include "syscall.h"
+/* NOTE(review): io_syscall3 comes from syscall.h (not shown here); presumably it emits io_submit(io_context_t ctx, long nr, struct iocb **iocbs) as a stub for the io_submit syscall -- confirm. */
+io_syscall3(int, io_submit, io_submit, io_context_t, ctx, long, nr, struct iocb **, iocbs)
diff --git a/tools/libaio/src/libaio.h b/tools/libaio/src/libaio.h
new file mode 100644 (file)
index 0000000..6574601
--- /dev/null
@@ -0,0 +1,222 @@
+/* /usr/include/libaio.h
+ *
+ * Copyright 2000,2001,2002 Red Hat, Inc.
+ *
+ * Written by Benjamin LaHaise <bcrl@redhat.com>
+ *
+ * libaio Linux async I/O interface
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#ifndef __LIBAIO_H
+#define __LIBAIO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <string.h>
+
+struct timespec;
+struct sockaddr;
+struct iovec;
+struct iocb;
+
+typedef struct io_context *io_context_t;
+
+/* Request opcodes.  The io_prep_*() helpers in this header store one of
+ * these values in iocb->aio_lio_opcode.  The value 4 is left unassigned. */
+typedef enum io_iocb_cmd {
+       IO_CMD_PREAD = 0,
+       IO_CMD_PWRITE = 1,
+
+       IO_CMD_FSYNC = 2,
+       IO_CMD_FDSYNC = 3,
+
+       IO_CMD_POLL = 5,
+       IO_CMD_NOOP = 6,
+} io_iocb_cmd_t;
+
+/* Field-declaration helpers for the structures in this header.  Each one
+ * expands to a declaration of field x.  On the 32-bit targets an `unsigned`
+ * pad named y is added after x (little endian) or before x (big endian).
+ * On the 64-bit targets PADDEDptr/PADDEDul declare x alone, while PADDED
+ * either declares y as a second declarator of x's type (little endian) or
+ * as a leading `unsigned` pad (big endian).  Unknown targets stop the
+ * build with #error. */
+#if defined(__i386__) /* little endian, 32 bits */
+#define PADDED(x, y)   x; unsigned y
+#define PADDEDptr(x, y)        x; unsigned y
+#define PADDEDul(x, y) unsigned long x; unsigned y
+#elif defined(__ia64__) || defined(__x86_64__) || defined(__alpha__) /* little endian, 64 bits */
+#define PADDED(x, y)   x, y
+#define PADDEDptr(x, y)        x
+#define PADDEDul(x, y) unsigned long x
+#elif defined(__powerpc64__) /* big endian, 64 bits */
+#define PADDED(x, y)   unsigned y; x
+#define PADDEDptr(x,y) x
+#define PADDEDul(x, y) unsigned long x
+#elif defined(__PPC__)  /* big endian, 32 bits */
+#define PADDED(x, y)   unsigned y; x
+#define PADDEDptr(x, y)        unsigned y; x
+#define PADDEDul(x, y) unsigned y; unsigned long x
+#elif defined(__s390x__) /* big endian, 64 bits */
+#define PADDED(x, y)   unsigned y; x
+#define PADDEDptr(x,y) x
+#define PADDEDul(x, y) unsigned long x
+#elif defined(__s390__) /* big endian, 32 bits */
+#define PADDED(x, y)   unsigned y; x
+#define PADDEDptr(x, y) unsigned y; x
+#define PADDEDul(x, y) unsigned y; unsigned long x
+#else
+#error endian?
+#endif
+
+/* Per-request arguments for IO_CMD_POLL; io_prep_poll() fills in events. */
+struct io_iocb_poll {
+       PADDED(int events, __pad1);
+};     /* result code is the set of result flags or -'ve errno */
+
+/* Socket-address argument block (the `saddr` member of the iocb union).
+ * No helper in this header fills it in. */
+struct io_iocb_sockaddr {
+       struct sockaddr *addr;
+       int             len;
+};     /* result code is the length of the sockaddr, or -'ve errno */
+
+/* Buffer, length and offset arguments; io_prep_pread() and io_prep_pwrite()
+ * fill in buf, nbytes and offset. */
+struct io_iocb_common {
+       PADDEDptr(void  *buf, __pad1);
+       PADDEDul(nbytes, __pad2);
+       long long       offset;
+       long long       __pad3, __pad4;
+};     /* result code is the amount read or -'ve errno */
+
+/* Vectored argument block (the `v` member of the iocb union).
+ * No helper in this header fills it in. */
+struct io_iocb_vector {
+       const struct iovec      *vec;
+       int                     nr;
+       long long               offset;
+};     /* result code is the amount read or -'ve errno */
+
+/* One I/O request, as taken by io_submit(), io_cancel() and the io_prep_*()
+ * helpers.  aio_lio_opcode holds an io_iocb_cmd value and `u` holds the
+ * arguments that go with that opcode. */
+struct iocb {
+       PADDEDptr(void *data, __pad1);  /* Return in the io completion event */
+       PADDED(unsigned key, __pad2);   /* For use in identifying io requests */
+
+       short           aio_lio_opcode; 
+       short           aio_reqprio;
+       int             aio_fildes;
+
+       union {
+               struct io_iocb_common           c;
+               struct io_iocb_vector           v;
+               struct io_iocb_poll             poll;
+               struct io_iocb_sockaddr saddr;
+       } u;
+};
+
+/* Completion record type taken by io_getevents() and io_cancel().  The
+ * field names mirror the arguments of io_callback_t (iocb, res, res2). */
+struct io_event {
+       PADDEDptr(void *data, __pad1);
+       PADDEDptr(struct iocb *obj,  __pad2);
+       PADDEDul(res,  __pad3);
+       PADDEDul(res2, __pad4);
+};
+
+#undef PADDED
+#undef PADDEDptr
+#undef PADDEDul
+
+/* Completion callback type; io_set_callback() stores such a pointer in
+ * iocb->data. */
+typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2);
+
+/* library wrappers */
+extern int io_queue_init(int maxevents, io_context_t *ctxp);
+/*extern int io_queue_grow(io_context_t ctx, int new_maxevents);*/
+extern int io_queue_release(io_context_t ctx);
+/* NOTE(review): io_queue_wait is disabled here, yet libaio.map lists it as a
+ * global symbol -- confirm which of the two is intended. */
+/*extern int io_queue_wait(io_context_t ctx, struct timespec *timeout);*/
+extern int io_queue_run(io_context_t ctx);
+
+/* Actual syscalls */
+extern int io_setup(int maxevents, io_context_t *ctxp);
+extern int io_destroy(io_context_t ctx);
+extern int io_submit(io_context_t ctx, long nr, struct iocb *ios[]);
+extern int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt);
+extern int io_getevents(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout);
+
+
+/* Attach completion callback cb to iocb by storing it in iocb->data. */
+static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)
+{
+       void *cookie = (void *)cb;
+
+       iocb->data = cookie;
+}
+
+/* Zero *iocb and set it up as an IO_CMD_PREAD request, recording fd, buf,
+ * count and offset. */
+static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+{
+       memset(iocb, 0, sizeof(struct iocb));
+
+       /* request header */
+       iocb->aio_lio_opcode = IO_CMD_PREAD;
+       iocb->aio_reqprio = 0;
+       iocb->aio_fildes = fd;
+
+       /* transfer description */
+       iocb->u.c.offset = offset;
+       iocb->u.c.nbytes = count;
+       iocb->u.c.buf = buf;
+}
+
+/* Zero *iocb and set it up as an IO_CMD_PWRITE request, recording fd, buf,
+ * count and offset. */
+static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
+{
+       memset(iocb, 0, sizeof(struct iocb));
+
+       /* request header */
+       iocb->aio_lio_opcode = IO_CMD_PWRITE;
+       iocb->aio_reqprio = 0;
+       iocb->aio_fildes = fd;
+
+       /* transfer description */
+       iocb->u.c.offset = offset;
+       iocb->u.c.nbytes = count;
+       iocb->u.c.buf = buf;
+}
+
+/* Zero *iocb and set it up as an IO_CMD_POLL request on fd for events. */
+static inline void io_prep_poll(struct iocb *iocb, int fd, int events)
+{
+       memset(iocb, 0, sizeof(struct iocb));
+
+       iocb->aio_lio_opcode = IO_CMD_POLL;
+       iocb->aio_reqprio = 0;
+       iocb->aio_fildes = fd;
+       iocb->u.poll.events = events;
+}
+
+/* Prepare iocb as a poll request on fd, attach cb and submit it on ctx.
+ * Returns whatever io_submit() returns. */
+static inline int io_poll(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd, int events)
+{
+       int rc;
+
+       io_prep_poll(iocb, fd, events);
+       io_set_callback(iocb, cb);
+       rc = io_submit(ctx, 1, &iocb);
+       return rc;
+}
+
+/* Zero *iocb and set it up as an IO_CMD_FSYNC request on fd. */
+static inline void io_prep_fsync(struct iocb *iocb, int fd)
+{
+       memset(iocb, 0, sizeof(struct iocb));
+
+       iocb->aio_lio_opcode = IO_CMD_FSYNC;
+       iocb->aio_reqprio = 0;
+       iocb->aio_fildes = fd;
+}
+
+/* Prepare iocb as an fsync request on fd, attach cb and submit it on ctx.
+ * Returns whatever io_submit() returns. */
+static inline int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)
+{
+       int rc;
+
+       io_prep_fsync(iocb, fd);
+       io_set_callback(iocb, cb);
+       rc = io_submit(ctx, 1, &iocb);
+       return rc;
+}
+
+/* Zero *iocb and set it up as an IO_CMD_FDSYNC request on fd. */
+static inline void io_prep_fdsync(struct iocb *iocb, int fd)
+{
+       memset(iocb, 0, sizeof(struct iocb));
+
+       iocb->aio_lio_opcode = IO_CMD_FDSYNC;
+       iocb->aio_reqprio = 0;
+       iocb->aio_fildes = fd;
+}
+
+/* Prepare iocb as an fdsync request on fd, attach cb and submit it on ctx.
+ * Returns whatever io_submit() returns. */
+static inline int io_fdsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)
+{
+       int rc;
+
+       io_prep_fdsync(iocb, fd);
+       io_set_callback(iocb, cb);
+       rc = io_submit(ctx, 1, &iocb);
+       return rc;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LIBAIO_H */
diff --git a/tools/libaio/src/libaio.map b/tools/libaio/src/libaio.map
new file mode 100644 (file)
index 0000000..dc37725
--- /dev/null
@@ -0,0 +1,22 @@
+/* Base version node: exports the symbols listed under global: and makes
+ * every other symbol local to the library. */
+LIBAIO_0.1 {
+       global:
+               io_queue_init;
+               io_queue_run;
+               io_queue_wait;
+               io_queue_release;
+               io_cancel;
+               io_submit;
+               io_getevents;
+       local:
+               *;
+
+};
+
+/* Newer version node, declared as depending on LIBAIO_0.1.  io_cancel,
+ * io_getevents and io_queue_wait appear in both nodes. */
+LIBAIO_0.4 {
+       global:
+               io_setup;
+               io_destroy;
+               io_cancel;
+               io_getevents;
+               io_queue_wait;
+} LIBAIO_0.1;
diff --git a/tools/libaio/src/raw_syscall.c b/tools/libaio/src/raw_syscall.c
new file mode 100644 (file)
index 0000000..3c8d7fa
--- /dev/null
@@ -0,0 +1,18 @@
+#include "syscall.h"
+
+#if defined(__ia64__)
+/* based on code from glibc by Jes Sorensen */
+/* Out-of-line stub: copies r32 (its single declared input register) into
+ * r15, the register the wrappers in syscall-ia64.h load the syscall number
+ * into, then traps with break 0x100000 and returns.  The .proc directive
+ * opens the procedure that the closing .endp refers to. */
+__asm__(".text\n"
+       ".globl __ia64_aio_raw_syscall\n"
+       ".proc __ia64_aio_raw_syscall\n"
+       "__ia64_aio_raw_syscall:\n"
+       "alloc r2=ar.pfs,1,0,8,0\n"
+       "mov r15=r32\n"
+       "break 0x100000\n"
+       ";;"
+       "br.ret.sptk.few b0\n"
+       ".size __ia64_aio_raw_syscall, . - __ia64_aio_raw_syscall\n"
+       ".endp __ia64_aio_raw_syscall"
+);
+#endif
+
+;
diff --git a/tools/libaio/src/syscall-alpha.h b/tools/libaio/src/syscall-alpha.h
new file mode 100644 (file)
index 0000000..467b74f
--- /dev/null
@@ -0,0 +1,209 @@
+/* Syscall numbers used by the wrappers generated from this header. */
+#define __NR_io_setup          398
+#define __NR_io_destroy                399
+#define __NR_io_getevents      400
+#define __NR_io_submit         401
+#define __NR_io_cancel         402
+
+/* Pieces shared by the inline_syscallN macros: an (empty) register binding
+ * for _sc_0, the output constraint used for it, and the common clobber
+ * list. */
+#define inline_syscall_r0_asm
+#define inline_syscall_r0_out_constraint        "=v"
+
+#define inline_syscall_clobbers                    \
+   "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", \
+   "$22", "$23", "$24", "$25", "$27", "$28", "memory"
+
+/* callsys with no arguments.  Expects _sc_ret and _sc_err to be declared by
+ * the enclosing scope and sets them from _sc_0 and $19.  The unused
+ * argument registers $16-$18, $20 and $21 are listed as clobbered. */
+#define inline_syscall0(name, args...)                          \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_19 __asm__("$19");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2"                              \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19)                                       \
+          : "0"(_sc_0)                                         \
+          : inline_syscall_clobbers,                           \
+             "$16", "$17", "$18", "$20", "$21");                \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with one argument, passed in $16.  Sets the enclosing scope's
+ * _sc_ret and _sc_err from _sc_0 and $19; $17, $18, $20 and $21 are listed
+ * as clobbered. */
+#define inline_syscall1(name,arg1)                              \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_19 __asm__("$19");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3"                           \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16)                         \
+          : "0"(_sc_0), "2"(_sc_16)                            \
+          : inline_syscall_clobbers,                           \
+             "$17", "$18", "$20", "$21");                       \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with two arguments, passed in $16 and $17.  Sets the enclosing
+ * scope's _sc_ret and _sc_err from _sc_0 and $19; $18, $20 and $21 are
+ * listed as clobbered. */
+#define inline_syscall2(name,arg1,arg2)                         \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_17 __asm__("$17");                    \
+        register long _sc_19 __asm__("$19");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        _sc_17 = (long) (arg2);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3 %4"                        \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17)           \
+          : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17)               \
+          : inline_syscall_clobbers,                           \
+             "$18", "$20", "$21");                              \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with three arguments, passed in $16-$18.  Sets the enclosing
+ * scope's _sc_ret and _sc_err from _sc_0 and $19; $20 and $21 are listed
+ * as clobbered. */
+#define inline_syscall3(name,arg1,arg2,arg3)                    \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_17 __asm__("$17");                    \
+        register long _sc_18 __asm__("$18");                    \
+        register long _sc_19 __asm__("$19");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        _sc_17 = (long) (arg2);                                 \
+        _sc_18 = (long) (arg3);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3 %4 %5"                     \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17),          \
+             "=r"(_sc_18)                                       \
+          : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17),              \
+             "4"(_sc_18)                                        \
+          : inline_syscall_clobbers, "$20", "$21");            \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with four arguments, passed in $16-$19.  $19 carries arg4 in and
+ * is also the register _sc_err is read from afterwards; $20 and $21 are
+ * listed as clobbered. */
+#define inline_syscall4(name,arg1,arg2,arg3,arg4)               \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_17 __asm__("$17");                    \
+        register long _sc_18 __asm__("$18");                    \
+        register long _sc_19 __asm__("$19");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        _sc_17 = (long) (arg2);                                 \
+        _sc_18 = (long) (arg3);                                 \
+        _sc_19 = (long) (arg4);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3 %4 %5 %6"                  \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17),          \
+             "=r"(_sc_18)                                       \
+          : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17),              \
+             "4"(_sc_18), "1"(_sc_19)                           \
+          : inline_syscall_clobbers, "$20", "$21");            \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with five arguments, passed in $16-$20.  $19 carries arg4 in and
+ * is also the register _sc_err is read from afterwards; $21 is listed as
+ * clobbered. */
+#define inline_syscall5(name,arg1,arg2,arg3,arg4,arg5)          \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_17 __asm__("$17");                    \
+        register long _sc_18 __asm__("$18");                    \
+        register long _sc_19 __asm__("$19");                    \
+        register long _sc_20 __asm__("$20");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        _sc_17 = (long) (arg2);                                 \
+        _sc_18 = (long) (arg3);                                 \
+        _sc_19 = (long) (arg4);                                 \
+        _sc_20 = (long) (arg5);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3 %4 %5 %6 %7"               \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17),          \
+             "=r"(_sc_18), "=r"(_sc_20)                         \
+          : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17),              \
+             "4"(_sc_18), "1"(_sc_19), "5"(_sc_20)              \
+          : inline_syscall_clobbers, "$21");                   \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* callsys with six arguments, passed in $16-$21.  $19 carries arg4 in and
+ * is also the register _sc_err is read from afterwards.  Only the common
+ * clobber list is needed because every argument register is an operand. */
+#define inline_syscall6(name,arg1,arg2,arg3,arg4,arg5,arg6)     \
+{                                                               \
+        register long _sc_0 inline_syscall_r0_asm;              \
+        register long _sc_16 __asm__("$16");                    \
+        register long _sc_17 __asm__("$17");                    \
+        register long _sc_18 __asm__("$18");                    \
+        register long _sc_19 __asm__("$19");                    \
+        register long _sc_20 __asm__("$20");                    \
+        register long _sc_21 __asm__("$21");                    \
+                                                                \
+        _sc_0 = name;                                           \
+        _sc_16 = (long) (arg1);                                 \
+        _sc_17 = (long) (arg2);                                 \
+        _sc_18 = (long) (arg3);                                 \
+        _sc_19 = (long) (arg4);                                 \
+        _sc_20 = (long) (arg5);                                 \
+        _sc_21 = (long) (arg6);                                 \
+        __asm__ __volatile__                                    \
+          ("callsys # %0 %1 <= %2 %3 %4 %5 %6 %7 %8"            \
+          : inline_syscall_r0_out_constraint (_sc_0),          \
+             "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17),          \
+             "=r"(_sc_18), "=r"(_sc_20), "=r"(_sc_21)           \
+          : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17), "4"(_sc_18), \
+             "1"(_sc_19), "5"(_sc_20), "6"(_sc_21)              \
+          : inline_syscall_clobbers);                          \
+        _sc_ret = _sc_0, _sc_err = _sc_19;                      \
+}
+
+/* Statement expression that declares _sc_ret/_sc_err, runs
+ * inline_syscall<nr> for __NR_<name>, negates _sc_ret when _sc_err is
+ * non-zero, and yields _sc_ret. */
+#define INLINE_SYSCALL1(name, nr, args...)      \
+({                                              \
+        long _sc_ret, _sc_err;                  \
+        inline_syscall##nr(__NR_##name, args);  \
+        if (_sc_err != 0)                       \
+        {                                       \
+            _sc_ret = -(_sc_ret);               \
+        }                                       \
+        _sc_ret;                                \
+})
+
+/* io_syscallN(type, fname, sname, type1, arg1, ...) defines a function
+ * `type fname(type1 arg1, ...)` that returns INLINE_SYSCALL1(sname, N, ...)
+ * cast to type. */
+#define io_syscall1(type,fname,sname,type1,arg1)                       \
+type fname(type1 arg1)                                                 \
+{                                                                       \
+   return (type)INLINE_SYSCALL1(sname, 1, arg1);                        \
+}
+
+#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2)            \
+type fname(type1 arg1,type2 arg2)                                      \
+{                                                                      \
+   return (type)INLINE_SYSCALL1(sname, 2, arg1, arg2);                  \
+}
+
+#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \
+type fname(type1 arg1,type2 arg2,type3 arg3)                           \
+{                                                                      \
+   return (type)INLINE_SYSCALL1(sname, 3, arg1, arg2, arg3);            \
+}
+
+#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4)            \
+{                                                                      \
+   return (type)INLINE_SYSCALL1(sname, 4, arg1, arg2, arg3, arg4);      \
+}
+
+#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+         type5,arg5)                                                   \
+type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5)    \
+{                                                                      \
+   return (type)INLINE_SYSCALL1(sname, 5, arg1, arg2, arg3, arg4, arg5);\
+}
diff --git a/tools/libaio/src/syscall-i386.h b/tools/libaio/src/syscall-i386.h
new file mode 100644 (file)
index 0000000..9576975
--- /dev/null
@@ -0,0 +1,72 @@
+/* Syscall numbers used by the int $0x80 wrappers generated from this header. */
+#define __NR_io_setup          245
+#define __NR_io_destroy                246
+#define __NR_io_getevents      247
+#define __NR_io_submit         248
+#define __NR_io_cancel         249
+
+/* Define `type fname(type1 arg1)` as an int $0x80 call of __NR_<sname>.
+ * arg1 travels in %edi and is swapped into %ebx only around the trap, so
+ * %ebx holds its original value again afterwards.  The raw value left in
+ * %eax is returned.  The "memory" clobber tells the compiler that the call
+ * may read and write memory reachable through its arguments. */
+#define io_syscall1(type,fname,sname,type1,arg1)       \
+type fname(type1 arg1)                                 \
+{                                                      \
+long __res;                                            \
+__asm__ volatile ("xchgl %%edi,%%ebx\n"                        \
+                 "int $0x80\n"                         \
+                 "xchgl %%edi,%%ebx"                   \
+       : "=a" (__res)                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1))         \
+       : "memory");                                    \
+return __res;                                          \
+}
+
+/* Define `type fname(type1 arg1, type2 arg2)` as an int $0x80 call of
+ * __NR_<sname>.  arg1 travels in %edi and is swapped into %ebx only around
+ * the trap; arg2 is passed in %ecx.  The raw value left in %eax is
+ * returned.  The "memory" clobber tells the compiler that the call may read
+ * and write memory reachable through its arguments. */
+#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2)            \
+type fname(type1 arg1,type2 arg2)                                      \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile ("xchgl %%edi,%%ebx\n"                                        \
+                 "int $0x80\n"                                         \
+                 "xchgl %%edi,%%ebx"                                   \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2))      \
+       : "memory");                                                    \
+return __res;                                                          \
+}
+
+/* Define `type fname(type1 arg1, type2 arg2, type3 arg3)` as an int $0x80
+ * call of __NR_<sname>.  arg1 travels in %edi and is swapped into %ebx only
+ * around the trap; arg2 and arg3 are passed in %ecx and %edx.  The raw
+ * value left in %eax is returned.  The "memory" clobber tells the compiler
+ * that the call may read and write memory reachable through its
+ * arguments. */
+#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \
+type fname(type1 arg1,type2 arg2,type3 arg3)                           \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile ("xchgl %%edi,%%ebx\n"                                        \
+                 "int $0x80\n"                                         \
+                 "xchgl %%edi,%%ebx"                                   \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2)),     \
+                 "d" ((long)(arg3))                                    \
+       : "memory");                                                    \
+return __res;                                                          \
+}
+
+/* Define a four-argument function `fname` as an int $0x80 call of
+ * __NR_<sname>.  arg1 travels in %edi and is swapped into %ebx only around
+ * the trap; arg2, arg3 and arg4 are passed in %ecx, %edx and %esi.  The raw
+ * value left in %eax is returned.  The "memory" clobber tells the compiler
+ * that the call may read and write memory reachable through its
+ * arguments. */
+#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4)            \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile ("xchgl %%edi,%%ebx\n"                                        \
+                 "int $0x80\n"                                         \
+                 "xchgl %%edi,%%ebx"                                   \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2)),     \
+         "d" ((long)(arg3)),"S" ((long)(arg4))                         \
+       : "memory");                                                    \
+return __res;                                                          \
+}
+
+/* Define a five-argument function `fname` as an int $0x80 call of
+ * __NR_<sname>.  %edi is needed for arg5 here, so %ebx is saved to the
+ * stack slot `tmp`, loaded with arg1 for the trap and restored afterwards.
+ * `tmp` is written by the asm and is therefore declared as an output
+ * operand (%1); arg1 is operand %3.  The raw value left in %eax is
+ * returned.  The "memory" clobber tells the compiler that the call may read
+ * and write memory reachable through its arguments. */
+#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+         type5,arg5)                                                   \
+type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5)    \
+{                                                                      \
+long __res;                                                            \
+long tmp;                                                              \
+__asm__ volatile ("movl %%ebx,%1\n"                                    \
+                 "movl %3,%%ebx\n"                                     \
+                 "int $0x80\n"                                         \
+                 "movl %1,%%ebx"                                       \
+       : "=a" (__res), "=m" (tmp)                                      \
+       : "0" (__NR_##sname),"rm" ((long)(arg1)),"c" ((long)(arg2)),    \
+         "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5))      \
+       : "memory");                                                    \
+return __res;                                                          \
+}
diff --git a/tools/libaio/src/syscall-ia64.h b/tools/libaio/src/syscall-ia64.h
new file mode 100644 (file)
index 0000000..2f6a01a
--- /dev/null
@@ -0,0 +1,44 @@
+/* Syscall numbers loaded into r15 by the stubs generated from this header. */
+#define __NR_io_setup          1238
+#define __NR_io_destroy                1239
+#define __NR_io_getevents      1240
+#define __NR_io_submit         1241
+#define __NR_io_cancel         1242
+
+/* Emit an out-of-line assembly stub named fname that loads __NR_<sname>
+ * into r15 and traps with break 0x100000.  When r10 equals -1 after the
+ * trap, r8 is replaced by 0 - r8 before returning.  The stub is bracketed
+ * by .proc/.endp so the closing .endp has a procedure to terminate. */
+#define __ia64_raw_syscall(fname, sname) \
+       __asm__ (".text\n"                                              \
+               ".globl " SYMSTR(fname) "\n"                            \
+               ".proc " SYMSTR(fname) "\n"                             \
+               SYMSTR(fname) ":\n"                                     \
+               "       mov r15=" SYMSTR( __NR_ ## sname ) "\n"         \
+               "       break 0x100000\n"                               \
+               "       ;;\n"                                           \
+               "       cmp.eq p6,p0=-1,r10\n"                          \
+               "       ;;\n"                                           \
+               "       (p6) sub r8=0,r8\n"                             \
+               "       br.ret.sptk.few b0\n"                           \
+               ".size " SYMSTR(fname) ", . - " SYMSTR(fname) "\n"      \
+               ".endp " SYMSTR(fname) "\n"                             \
+       );
+
+/* Zero-argument variant.  It takes a single name, which is passed as both
+ * the function name and the __NR_ suffix that the two-parameter
+ * __ia64_raw_syscall() expects. */
+#define io_syscall0(type, name)                                                \
+       extern type name(void);                                         \
+       __ia64_raw_syscall(name, name);
+
+/* io_syscallN(type, fname, sname, type1, arg1, ...) declares the prototype
+ * `extern type fname(type1 arg1, ...)` and emits the assembly stub for it
+ * through __ia64_raw_syscall(fname, sname).  The argument list is used only
+ * for the prototype; the stub itself does not reference the arguments. */
+#define io_syscall1(type, fname, sname, type1, arg1)                   \
+       extern type fname(type1 arg1);                                  \
+       __ia64_raw_syscall(fname, sname);
+
+#define io_syscall2(type, fname, sname, type1, arg1, type2, arg2)      \
+       extern type fname(type1 arg1, type2 arg2);                      \
+       __ia64_raw_syscall(fname, sname);
+
+#define io_syscall3(type, fname, sname, type1, arg1, type2, arg2, type3, arg3) \
+       extern type fname(type1 arg1, type2 arg2, type3 arg3);          \
+       __ia64_raw_syscall(fname, sname);
+
+#define io_syscall4(type, fname, sname, type1, arg1, type2, arg2, type3, arg3, type4, arg4)    \
+       extern type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4);              \
+       __ia64_raw_syscall(fname, sname);
+
+#define io_syscall5(type, fname, sname, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5)       \
+       extern type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5);                  \
+       __ia64_raw_syscall(fname, sname);
diff --git a/tools/libaio/src/syscall-ppc.h b/tools/libaio/src/syscall-ppc.h
new file mode 100644 (file)
index 0000000..ca70dd2
--- /dev/null
@@ -0,0 +1,94 @@
+/* Syscall numbers loaded into r0 by __sc_loadargs_0. */
+#define __NR_io_setup          227
+#define __NR_io_destroy                228
+#define __NR_io_getevents      229
+#define __NR_io_submit         230
+#define __NR_io_cancel         231
+
+/* On powerpc a system call basically clobbers the same registers as a
+ * function call, with the exception of LR (which is needed for the
+ * "sc; bnslr" sequence) and CR (where only CR0.SO is clobbered to signal
+ * an error return status).
+ */
+
+/* Function-body template.  It declares register variables bound to r0 and
+ * r3-r8, loads the syscall number and the nr arguments through
+ * __sc_loadargs_<nr>, executes `sc` and copies CR into r0 with mfcr.  If
+ * bit 0x10000000 of that value is set the enclosing function returns
+ * -(int)r3, otherwise r3 cast to type.  The macro ends in a `return`
+ * without a semicolon, so it must be the last statement of the function. */
+#define __syscall_nr(nr, type, name, args...)                          \
+       unsigned long __sc_ret, __sc_err;                               \
+       {                                                               \
+               register unsigned long __sc_0  __asm__ ("r0");          \
+               register unsigned long __sc_3  __asm__ ("r3");          \
+               register unsigned long __sc_4  __asm__ ("r4");          \
+               register unsigned long __sc_5  __asm__ ("r5");          \
+               register unsigned long __sc_6  __asm__ ("r6");          \
+               register unsigned long __sc_7  __asm__ ("r7");          \
+               register unsigned long __sc_8  __asm__ ("r8");          \
+                                                                       \
+               __sc_loadargs_##nr(name, args);                         \
+               __asm__ __volatile__                                    \
+                       ("sc           \n\t"                            \
+                        "mfcr %0      "                                \
+                       : "=&r" (__sc_0),                               \
+                         "=&r" (__sc_3),  "=&r" (__sc_4),              \
+                         "=&r" (__sc_5),  "=&r" (__sc_6),              \
+                         "=&r" (__sc_7),  "=&r" (__sc_8)               \
+                       : __sc_asm_input_##nr                           \
+                       : "cr0", "ctr", "memory",                       \
+                               "r9", "r10","r11", "r12");              \
+               __sc_ret = __sc_3;                                      \
+               __sc_err = __sc_0;                                      \
+       }                                                               \
+       if (__sc_err & 0x10000000) return -((int)__sc_ret);             \
+       return (type) __sc_ret
+
+/* Argument loaders used by __syscall_nr.  __sc_loadargs_0 stores
+ * __NR_<name> in __sc_0; each __sc_loadargs_N first expands
+ * __sc_loadargs_(N-1) and then stores argN, cast to unsigned long, in the
+ * next register variable (__sc_3 for arg1 through __sc_7 for arg5). */
+#define __sc_loadargs_0(name, dummy...)                                        \
+       __sc_0 = __NR_##name
+#define __sc_loadargs_1(name, arg1)                                    \
+       __sc_loadargs_0(name);                                          \
+       __sc_3 = (unsigned long) (arg1)
+#define __sc_loadargs_2(name, arg1, arg2)                              \
+       __sc_loadargs_1(name, arg1);                                    \
+       __sc_4 = (unsigned long) (arg2)
+#define __sc_loadargs_3(name, arg1, arg2, arg3)                                \
+       __sc_loadargs_2(name, arg1, arg2);                              \
+       __sc_5 = (unsigned long) (arg3)
+#define __sc_loadargs_4(name, arg1, arg2, arg3, arg4)                  \
+       __sc_loadargs_3(name, arg1, arg2, arg3);                        \
+       __sc_6 = (unsigned long) (arg4)
+#define __sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5)            \
+       __sc_loadargs_4(name, arg1, arg2, arg3, arg4);                  \
+       __sc_7 = (unsigned long) (arg5)
+
+/* Input-operand lists for the asm in __syscall_nr.  Each list extends the
+ * previous one; the digit constraints tie every input to the output
+ * operand with the same index (0 = __sc_0, 1 = __sc_3, ... 5 = __sc_7). */
+#define __sc_asm_input_0 "0" (__sc_0)
+#define __sc_asm_input_1 __sc_asm_input_0, "1" (__sc_3)
+#define __sc_asm_input_2 __sc_asm_input_1, "2" (__sc_4)
+#define __sc_asm_input_3 __sc_asm_input_2, "3" (__sc_5)
+#define __sc_asm_input_4 __sc_asm_input_3, "4" (__sc_6)
+#define __sc_asm_input_5 __sc_asm_input_4, "5" (__sc_7)
+
+/* io_syscallN(type, fname, sname, type1, arg1, ...) defines a function
+ * `type fname(type1 arg1, ...)` whose whole body is
+ * __syscall_nr(N, type, sname, arg1, ...). */
+#define io_syscall1(type,fname,sname,type1,arg1)                               \
+type fname(type1 arg1)                                                 \
+{                                                                      \
+       __syscall_nr(1, type, sname, arg1);                             \
+}
+
+#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2)            \
+type fname(type1 arg1, type2 arg2)                                     \
+{                                                                      \
+       __syscall_nr(2, type, sname, arg1, arg2);                       \
+}
+
+#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \
+type fname(type1 arg1, type2 arg2, type3 arg3)                         \
+{                                                                      \
+       __syscall_nr(3, type, sname, arg1, arg2, arg3);                 \
+}
+
+#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4)             \
+{                                                                      \
+       __syscall_nr(4, type, sname, arg1, arg2, arg3, arg4);           \
+}
+
+#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \
+type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
+{                                                                      \
+       __syscall_nr(5, type, sname, arg1, arg2, arg3, arg4, arg5);     \
+}
diff --git a/tools/libaio/src/syscall-s390.h b/tools/libaio/src/syscall-s390.h
new file mode 100644 (file)
index 0000000..3ec5ee3
--- /dev/null
@@ -0,0 +1,131 @@
+#define __NR_io_setup          243     /* syscall numbers for the five AIO calls wrapped by the io_syscallN macros */
+#define __NR_io_destroy                244
+#define __NR_io_getevents      245
+#define __NR_io_submit         246
+#define __NR_io_cancel         247
+/* Clobber list shared by every svc sequence below: register 1 (the "la %%r1" fallback path writes it), the condition code, and memory. */
+#define io_svc_clobber "1", "cc", "memory"
+/* io_syscall1: define "type fname(type1 arg1)" issuing system call __NR_<sname>; arg1 is bound to register 2 and the result is read back from register 2. */
+#define io_syscall1(type,fname,sname,type1,arg1)               \
+type fname(type1 arg1) {                                       \
+       register type1 __arg1 asm("2") = arg1;                  \
+       register long __svcres asm("2");                        \
+       long __res;                                             \
+       __asm__ __volatile__ (                                  \
+               "    .if %1 < 256\n"   /* small numbers are encoded in the svc instruction itself */ \
+               "    svc %b1\n"                                 \
+               "    .else\n"          /* otherwise load the number into r1 and issue svc 0 */ \
+               "    la  %%r1,%1\n"                             \
+               "    svc 0\n"                                   \
+               "    .endif"                                    \
+               : "=d" (__svcres)                               \
+               : "i" (__NR_##sname),                           \
+                 "0" (__arg1)                                  \
+               : io_svc_clobber );                             \
+       __res = __svcres;                                       \
+       return (type) __res;                                    \
+}
+/* io_syscall2: as io_syscall1 with a second argument bound to register 3; the result is read back from register 2. */
+#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2)    \
+type fname(type1 arg1, type2 arg2) {                           \
+       register type1 __arg1 asm("2") = arg1;                  \
+       register type2 __arg2 asm("3") = arg2;                  \
+       register long __svcres asm("2");                        \
+       long __res;                                             \
+       __asm__ __volatile__ (                                  \
+               "    .if %1 < 256\n"                            \
+               "    svc %b1\n"                                 \
+               "    .else\n"                                   \
+               "    la %%r1,%1\n"                              \
+               "    svc 0\n"                                   \
+               "    .endif"                                    \
+               : "=d" (__svcres)                               \
+               : "i" (__NR_##sname),                           \
+                 "0" (__arg1),                                 \
+                 "d" (__arg2)                                  \
+               : io_svc_clobber );                             \
+       __res = __svcres;                                       \
+       return (type) __res;                                    \
+}
+/* io_syscall3: three arguments bound to registers 2, 3 and 4; same svc / "la r1 + svc 0" selection on the syscall number; result read back from register 2. */
+#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,    \
+                   type3,arg3)                                 \
+type fname(type1 arg1, type2 arg2, type3 arg3) {               \
+       register type1 __arg1 asm("2") = arg1;                  \
+       register type2 __arg2 asm("3") = arg2;                  \
+       register type3 __arg3 asm("4") = arg3;                  \
+       register long __svcres asm("2");                        \
+       long __res;                                             \
+       __asm__ __volatile__ (                                  \
+               "    .if %1 < 256\n"                            \
+               "    svc %b1\n"                                 \
+               "    .else\n"                                   \
+               "    la  %%r1,%1\n"                             \
+               "    svc 0\n"                                   \
+               "    .endif"                                    \
+               : "=d" (__svcres)                               \
+               : "i" (__NR_##sname),                           \
+                 "0" (__arg1),                                 \
+                 "d" (__arg2),                                 \
+                 "d" (__arg3)                                  \
+               : io_svc_clobber );                             \
+       __res = __svcres;                                       \
+       return (type) __res;                                    \
+}
+/* io_syscall4: four arguments bound to registers 2 through 5; same svc / "la r1 + svc 0" selection on the syscall number; result read back from register 2. */
+#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,    \
+                   type3,arg3,type4,arg4)                      \
+type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+       register type1 __arg1 asm("2") = arg1;                  \
+       register type2 __arg2 asm("3") = arg2;                  \
+       register type3 __arg3 asm("4") = arg3;                  \
+       register type4 __arg4 asm("5") = arg4;                  \
+       register long __svcres asm("2");                        \
+       long __res;                                             \
+       __asm__ __volatile__ (                                  \
+               "    .if %1 < 256\n"                            \
+               "    svc %b1\n"                                 \
+               "    .else\n"                                   \
+               "    la  %%r1,%1\n"                             \
+               "    svc 0\n"                                   \
+               "    .endif"                                    \
+               : "=d" (__svcres)                               \
+               : "i" (__NR_##sname),                           \
+                 "0" (__arg1),                                 \
+                 "d" (__arg2),                                 \
+                 "d" (__arg3),                                 \
+                 "d" (__arg4)                                  \
+               : io_svc_clobber );                             \
+       __res = __svcres;                                       \
+       return (type) __res;                                    \
+}
+/* io_syscall5: five arguments bound to registers 2 through 6; same svc / "la r1 + svc 0" selection on the syscall number; result read back from register 2. */
+#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,    \
+                   type3,arg3,type4,arg4,type5,arg5)           \
+type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+          type5 arg5) {                                        \
+       register type1 __arg1 asm("2") = arg1;                  \
+       register type2 __arg2 asm("3") = arg2;                  \
+       register type3 __arg3 asm("4") = arg3;                  \
+       register type4 __arg4 asm("5") = arg4;                  \
+       register type5 __arg5 asm("6") = arg5;                  \
+       register long __svcres asm("2");                        \
+       long __res;                                             \
+       __asm__ __volatile__ (                                  \
+               "    .if %1 < 256\n"                            \
+               "    svc %b1\n"                                 \
+               "    .else\n"                                   \
+               "    la  %%r1,%1\n"                             \
+               "    svc 0\n"                                   \
+               "    .endif"                                    \
+               : "=d" (__svcres)                               \
+               : "i" (__NR_##sname),                           \
+                 "0" (__arg1),                                 \
+                 "d" (__arg2),                                 \
+                 "d" (__arg3),                                 \
+                 "d" (__arg4),                                 \
+                 "d" (__arg5)                                  \
+               : io_svc_clobber );                             \
+       __res = __svcres;                                       \
+       return (type) __res;                                    \
+}
diff --git a/tools/libaio/src/syscall-x86_64.h b/tools/libaio/src/syscall-x86_64.h
new file mode 100644 (file)
index 0000000..9361856
--- /dev/null
@@ -0,0 +1,63 @@
+#define __NR_io_setup          206     /* syscall numbers for the five AIO calls wrapped by the io_syscallN macros */
+#define __NR_io_destroy                207
+#define __NR_io_getevents      208
+#define __NR_io_submit         209
+#define __NR_io_cancel         210
+/* Shared pieces of every wrapper below: the clobber list they declare (r11, rcx, memory) and the instruction string they emit. */
+#define __syscall_clobber "r11","rcx","memory" 
+#define __syscall "syscall"
+/* io_syscall1: define fname(arg1); the syscall number enters through the "a" operand (tied via "0"), arg1 through "D", and the "a" output is returned. */
+#define io_syscall1(type,fname,sname,type1,arg1)                       \
+type fname(type1 arg1)                                                 \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile (__syscall                                            \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)) : __syscall_clobber );  \
+return __res;                                                          \
+}
+/* io_syscall2: as io_syscall1, with arg2 passed through the "S" operand. */
+#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2)            \
+type fname(type1 arg1,type2 arg2)                                      \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile (__syscall                                            \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
+return __res;                                                          \
+}
+/* io_syscall3: as io_syscall2, with arg3 passed through the "d" operand. */
+#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \
+type fname(type1 arg1,type2 arg2,type3 arg3)                           \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile (__syscall                                            \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)),     \
+                 "d" ((long)(arg3)) : __syscall_clobber);              \
+return __res;                                                          \
+}
+/* io_syscall4: arg4 is operand %5 (a "g" operand) and is copied into r10 by an explicit movq before the syscall, so r10 is added to the clobber list. */
+#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4)            \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile ("movq %5,%%r10 ;" __syscall                          \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)),     \
+         "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
+return __res;                                                          \
+} 
+/* io_syscall5: arg4 (%5) and arg5 (%6) are "g" operands copied into r10 and r8 by explicit movq instructions before the syscall; both registers are added to the clobber list. */
+#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+         type5,arg5)                                                   \
+type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5)    \
+{                                                                      \
+long __res;                                                            \
+__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall          \
+       : "=a" (__res)                                                  \
+       : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)),     \
+         "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) :    \
+       __syscall_clobber,"r8","r10" );                                 \
+return __res;                                                          \
+}
diff --git a/tools/libaio/src/syscall.h b/tools/libaio/src/syscall.h
new file mode 100644 (file)
index 0000000..0283825
--- /dev/null
@@ -0,0 +1,27 @@
+#include <sys/syscall.h>
+#include <unistd.h>
+/* Two-level stringification: SYMSTR(x) lets x be macro-expanded first, then _SYMSTR turns the result into a string literal. */
+#define _SYMSTR(str)   #str
+#define SYMSTR(str)    _SYMSTR(str)
+/* SYMVER: emit the assembler directive ".symver compat_sym,orig_sym@LIBAIO_<ver_sym>" (single '@'); the expansion already ends in ';'. */
+#define SYMVER(compat_sym, orig_sym, ver_sym)  \
+       __asm__(".symver " SYMSTR(compat_sym) "," SYMSTR(orig_sym) "@LIBAIO_" SYMSTR(ver_sym));
+/* DEFSYMVER: same directive as SYMVER but with '@@', which GNU as documents as marking the default version of the symbol; the expansion already ends in ';'. */
+#define DEFSYMVER(compat_sym, orig_sym, ver_sym)       \
+       __asm__(".symver " SYMSTR(compat_sym) "," SYMSTR(orig_sym) "@@LIBAIO_" SYMSTR(ver_sym));
+
+#if defined(__i386__)
+#include "syscall-i386.h"
+#elif defined(__x86_64__)
+#include "syscall-x86_64.h"
+#elif defined(__ia64__)
+#include "syscall-ia64.h"
+#elif defined(__PPC__)
+#include "syscall-ppc.h"
+#elif defined(__s390__)
+#include "syscall-s390.h"
+#elif defined(__alpha__)
+#include "syscall-alpha.h"
+#else
+#error "add syscall-arch.h"
+#endif
diff --git a/tools/libaio/src/vsys_def.h b/tools/libaio/src/vsys_def.h
new file mode 100644 (file)
index 0000000..13d032e
--- /dev/null
@@ -0,0 +1,24 @@
+/* libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+extern int vsys_io_setup(unsigned nr_reqs, io_context_t *ctxp);
+extern int vsys_io_destroy(io_context_t ctx);
+extern int vsys_io_submit(io_context_t ctx, long nr, struct iocb *iocbs[]);
+extern int vsys_io_cancel(io_context_t ctx, struct iocb *iocb);
+extern int vsys_io_wait(io_context_t ctx, struct iocb *iocb, const struct timespec *when);
+extern int vsys_io_getevents(io_context_t ctx_id, long nr, struct io_event *events, const struct timespec *timeout);
+
index cd354380907e6ccd4a2e9ccfc64f712daf7c22e8..e9bd8e18c7d71f78b2cb4822987531c17101494a 100644 (file)
@@ -92,6 +92,10 @@ def start_xenstored():
 def start_consoled():
     if os.fork() == 0:
         os.execvp('xenconsoled', ['xenconsoled'])
+
+def start_blktapctrl():  # launch 'blktapctrl' in a forked child, same pattern as start_consoled above
+    if os.fork() == 0:  # only the child (fork() == 0) enters; the parent falls through and returns None
+        os.execvp('blktapctrl', ['blktapctrl'])  # NOTE(review): if the exec fails the exception is raised inside the child -- confirm that is handled
             
 def main():
     try:
@@ -106,16 +110,19 @@ def main():
     elif sys.argv[1] == 'start':
         start_xenstored()
         start_consoled()
+       start_blktapctrl()
         return daemon.start()
     elif sys.argv[1] == 'trace_start':
         start_xenstored()
         start_consoled()
+        start_blktapctrl()
         return daemon.start(trace=1)
     elif sys.argv[1] == 'stop':
         return daemon.stop()
     elif sys.argv[1] == 'restart':
         start_xenstored()
         start_consoled()
+        start_blktapctrl()
         return daemon.stop() or daemon.start()
     elif sys.argv[1] == 'status':
         return daemon.status()
index e9b21c7ce5b82ca71890fc5d6324f3423aef9246..14f9f4311ab51e9a085384887f1e3f984d65664e 100644 (file)
@@ -1701,6 +1701,7 @@ def addControllerClass(device_class, cls):
 
 
 from xen.xend.server import blkif, netif, tpmif, pciif, iopif, irqif, usbif
+from xen.xend.server.BlktapController import BlktapController
 addControllerClass('vbd',  blkif.BlkifController)
 addControllerClass('vif',  netif.NetifController)
 addControllerClass('vtpm', tpmif.TPMifController)
@@ -1708,3 +1709,4 @@ addControllerClass('pci',  pciif.PciController)
 addControllerClass('ioports', iopif.IOPortsController)
 addControllerClass('irq',  irqif.IRQController)
 addControllerClass('usb',  usbif.UsbifController)
+addControllerClass('tap',  BlktapController)
diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py
new file mode 100644 (file)
index 0000000..062769a
--- /dev/null
@@ -0,0 +1,14 @@
+# Copyright (c) 2005, XenSource Ltd.
+
+
+from xen.xend.server.blkif import BlkifController
+
+
+class BlktapController(BlkifController):  # controller registered for 'tap' devices; only frontendRoot() differs from BlkifController
+    def __init__(self, vm):
+        BlkifController.__init__(self, vm)
+        
+    def frontendRoot(self):
+        """Return '<domain path>/device/vbd' for this controller's VM. @see DevController#frontendRoot"""
+        # NOTE(review): the path ends in 'vbd' although the class is registered as 'tap' -- presumably so guests see an ordinary block device; confirm.
+        return "%s/device/vbd" % self.vm.getDomainPath()
index e5d02734651406df3386f4ff8fe2e93d019c74da..4af00f458daf3e18340bb792c0fb388daf9802cf 100644 (file)
@@ -479,7 +479,13 @@ def configure_disks(config_devs, vals):
     """Create the config for disks (virtual block devices).
     """
     for (uname, dev, mode, backend) in vals.disk:
-        config_vbd = ['vbd',
+
+        if uname.startswith('tap:'):
+            cls = 'tap'
+        else:
+            cls = 'vbd'
+
+        config_vbd = [cls,
                       ['uname', uname],
                       ['dev', dev ],
                       ['mode', mode ] ]
index 791c18eacd9c79d5aa37cab29ad13f3b397b141c..f34ad0947e83e52e03f63efa6c6dd263ede5d61a 100644 (file)
@@ -994,7 +994,13 @@ def xm_block_attach(args):
     arg_check(args, 'block-attach', 4, 5)
 
     dom = args[0]
-    vbd = ['vbd',
+
+    if args[1].startswith('tap:'):
+        cls = 'tap'
+    else:
+        cls = 'vbd'
+        
+    vbd = [cls,
            ['uname', args[1]],
            ['dev',   args[2]],
            ['mode',  args[3]]]
index d6b143e1c613646bd8cec2911c1590ab9e8c0e99..c8a6a483d876e6bea53b710149abecb12af200c5 100644 (file)
@@ -35,7 +35,7 @@ XENSTORED_Linux = xenstored_linux.o
 XENSTORED_OBJS += $(XENSTORED_$(OS))
 
 .PHONY: all
-all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls
+all: libxenstore.so libxenstore.a xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls
 
 test_interleaved_transactions: test_interleaved_transactions.o
        $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
@@ -90,6 +90,9 @@ talloc_test.o: talloc.c
 libxenstore.so: xs.opic xs_lib.opic
        $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxenstore.so -shared -o $@ $^ -lpthread
 
+# Static library: archive the relocatable objects; an archive that merely wraps the linked .so cannot be linked statically.
+libxenstore.a: xs.o xs_lib.o
+	$(AR) rcs $@ $^
 .PHONY: clean
 clean: testsuite-clean
        rm -f *.o *.opic *.so
@@ -172,7 +175,7 @@ install: all
        $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin
        $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin
        $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
-       $(INSTALL_LIBS) libxenstore.so $(DESTDIR)/usr/$(LIBDIR)
+       $(INSTALL_DATA) libxenstore.* $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_DATA) xs.h $(DESTDIR)/usr/include
        $(INSTALL_DATA) xs_lib.h $(DESTDIR)/usr/include
 
index d951c9388f5f591845dfab17f85f1bd1943ac9e3..a89ec7e4edfb429e7a5bb4607fe93e1b8e6a3b0e 100644 (file)
@@ -110,8 +110,7 @@ __gnttab_map_grant_ref(
         return;
     }
 
-    if ( unlikely((rd = find_domain_by_id(op->dom)) == NULL) ||
-         unlikely(ld == rd) )
+    if ( unlikely((rd = find_domain_by_id(op->dom)) == NULL) )
     {
         if ( rd != NULL )
             put_domain(rd);
@@ -350,8 +349,7 @@ __gnttab_unmap_grant_ref(
     ref   = map->ref;
     flags = map->flags;
 
-    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
-         unlikely(ld == rd) )
+    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) )
     {
         if ( rd != NULL )
             put_domain(rd);